diff --git a/CMakeLists.txt b/CMakeLists.txt
index 528b9e865..90b641733 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -109,64 +109,67 @@ option_with_default(CMAKE_BUILD_TYPE "Build type (Release or Debug)" Release)
 
 ### compiler-only
 option_with_print(LIBINT2_BUILD_LIBRARY_AS_SUBPROJECT
-  "[EXPERT] Build generated library as a subproject: if FALSE will configure and build separately" OFF)
+        "[EXPERT] Build generated library as a subproject: if FALSE will configure and build separately" OFF)
 
 ### library-only
 option_with_print(LIBINT2_REQUIRE_CXX_API
-  "C++11 Libint API: define library targets + test (requires Eigen3, Boost is optional but strongly recommended)" ON)
+        "C++11 Libint API: define library targets + test (requires Eigen3, Boost is optional but strongly recommended)" ON)
 option_with_print(LIBINT2_REQUIRE_CXX_API_COMPILED
-  "Build C++11 Compiled (not just header-only) targets (requires Eigen3, Boost strongly recommended)" ON)
+        "Build C++11 Compiled (not just header-only) targets (requires Eigen3, Boost strongly recommended)" ON)
 option_with_print(LIBINT2_ENABLE_FORTRAN
-  "Build Fortran03+ Libint interface (requires Fortran)" OFF)
+        "Build Fortran03+ Libint interface (requires Fortran)" OFF)
 option_with_print(LIBINT2_ENABLE_PYTHON
-  "Build Python bindings (requires Python and Pybind11 and Eigen3)" OFF)
+        "Build Python bindings (requires Python and Pybind11 and Eigen3)" OFF)
 option_with_print(LIBINT2_PREFIX_PYTHON_INSTALL
-  "For LIBINT2_ENABLE_PYTHON=ON, whether to install the Python module in the Linux manner to CMAKE_INSTALL_PREFIX or to not install it. See target libint2-python-wheel for alternate installation in the Python manner to Python_EXECUTABLE's site-packages." OFF)
+        "For LIBINT2_ENABLE_PYTHON=ON, whether to install the Python module in the Linux manner to CMAKE_INSTALL_PREFIX or to not install it. See target libint2-python-wheel for alternate installation in the Python manner to Python_EXECUTABLE's site-packages." OFF)
 option_with_print(BUILD_SHARED_LIBS
-  "Build Libint library as shared, not static" OFF)
+        "Build Libint library as shared, not static" OFF)
 option_with_print(LIBINT2_BUILD_SHARED_AND_STATIC_LIBS
-  "Build both shared and static Libint libraries in one shot. Uses -fPIC." OFF)
+        "Build both shared and static Libint libraries in one shot. Uses -fPIC." OFF)
 option_with_print(LIBINT2_ENABLE_MPFR
-  "Use GNU MPFR library for high-precision testing (EXPERTS ONLY). Consumed at library build-time." OFF)
+        "Use GNU MPFR library for high-precision testing (EXPERTS ONLY). Consumed at library build-time." OFF)
 
 #  <<<  Which Integrals Classes, Which Derivative Levels  >>>
 
 option_with_default(LIBINT2_ENABLE_ONEBODY
-  "Compile with support for up to N-th derivatives of 1-body integrals (-1 for OFF)" 0)
+        "Compile with support for up to N-th derivatives of 1-body integrals (-1 for OFF)" 0)
 option_with_default(LIBINT2_ENABLE_ERI
-  "Compile with support for up to N-th derivatives of 4-center electron repulsion integrals (-1 for OFF)" 0)
+        "Compile with support for up to N-th derivatives of 4-center electron repulsion integrals (-1 for OFF)" 0)
 option_with_default(LIBINT2_ENABLE_ERI3
-  "Compile with support for up to N-th derivatives of 3-center electron repulsion integrals (-1 for OFF)" -1)
+        "Compile with support for up to N-th derivatives of 3-center electron repulsion integrals (-1 for OFF)" -1)
 option_with_default(LIBINT2_ENABLE_ERI2
-  "Compile with support for up to N-th derivatives of 2-center electron repulsion integrals (-1 for OFF)" -1)
+        "Compile with support for up to N-th derivatives of 2-center electron repulsion integrals (-1 for OFF)" -1)
+option_with_default(LIBINT2_ENABLE_RKB_ERI
+        "Compile with support for up to N-th derivatives of relativistic restricted kinetic
+         balance (RKB) 4-center electron repulsion integrals (-1 for OFF)" 0)
 option_with_default(LIBINT2_ENABLE_G12
-  "Compile with support for N-th derivatives of MP2-F12 energies with Gaussian factors (-1 for OFF)" -1)
+        "Compile with support for N-th derivatives of MP2-F12 energies with Gaussian factors (-1 for OFF)" -1)
 option_with_default(LIBINT2_ENABLE_G12DKH
-  "Compile with support for N-th derivatives of DKH-MP2-F12 energies with Gaussian factors (-1 for OFF)" -1)
+        "Compile with support for N-th derivatives of DKH-MP2-F12 energies with Gaussian factors (-1 for OFF)" -1)
 
 option_with_print(LIBINT2_DISABLE_ONEBODY_PROPERTY_DERIVS
-  "Disable geometric derivatives of 1-body property integrals (all but overlap, kinetic, elecpot).
+        "Disable geometric derivatives of 1-body property integrals (all but overlap, kinetic, elecpot).
    These derivatives are disabled by default to save compile time. (enable with OFF)
    Note that the libtool build won't enable this- if forcibly enabled, build_libint balks." ON)
 option_with_print(LIBINT2_ENABLE_T1G12
-  "Enable [Ti,G12] integrals when G12 integrals are enabled. Irrelevant when `LIBINT2_ENABLE_G12=OFF`. (disable with OFF)" ON)
+        "Enable [Ti,G12] integrals when G12 integrals are enabled. Irrelevant when `LIBINT2_ENABLE_G12=OFF`. (disable with OFF)" ON)
 
 #  <<<  Ordering Conventions  >>>
 
 option_with_default(LIBINT2_SHGAUSS_ORDERING
-  "Ordering for shells of solid harmonic Gaussians:
+        "Ordering for shells of solid harmonic Gaussians:
     standard -- standard ordering (-l, -l+1 ... l)
     gaussian -- the Gaussian ordering (0, 1, -1, 2, -2, ... l, -l)
    See https://github.com/evaleev/libint/blob/master/INSTALL.md#solid-harmonic-ordering-scope-and-history ." standard)
 option_with_default(LIBINT2_CARTGAUSS_ORDERING
-  "Orderings for shells of cartesian Gaussians:
+        "Orderings for shells of cartesian Gaussians:
     standard -- standard ordering (xxx, xxy, xxz, xyy, xyz, xzz, yyy, ...)
     intv3  -- intv3 ordering (yyy, yyz, yzz, zzz, xyy, xyz, xzz, xxy, xxz, xxx)
     gamess -- GAMESS ordering (xxx, yyy, zzz, xxy, xxz, yyx, yyz, zzx, zzy, xyz)
     orca -- ORCA ordering (hydrid between GAMESS and standard)
     bagel -- axis-permuted version of intv3 (xxx, xxy, xyy, yyy, xxz, xyz, yyz, xzz, yzz, zzz)" standard)
 option_with_default(LIBINT2_SHELL_SET
-  "Support computation of shell sets sets subject to these restrictions:
+        "Support computation of shell sets sets subject to these restrictions:
     standard -- standard ordering:
       for (ab|cd):
         l(a) >= l(b),
@@ -195,99 +198,107 @@ option_with_default(LIBINT2_SHELL_SET
 #    `export CMAKE_BUILD_PARALLEL_LEVEL=N`.
 
 option_with_default(LIBINT2_MAX_AM
-  "Support Gaussians of angular momentum up to N.
+        "Support Gaussians of angular momentum up to N.
    Can specify values for each derivative level as a semicolon-separated string.
    If ERI3 ints are enabled, this option also controls the AM of the paired centers." 4)
 option_with_default(LIBINT2_OPT_AM
-  "Optimize maximally for up to angular momentum N (N <= max-am).
+        "Optimize maximally for up to angular momentum N (N <= max-am).
    Can specify values for each derivative level as a semicolon-separated string. (default: (libint_max_am/2)+1)" -1)
 
 option_with_default(LIBINT2_MULTIPOLE_MAX_ORDER
-  "Maximum order of spherical multipole integrals. There is no maximum" 4)
+        "Maximum order of spherical multipole integrals. There is no maximum" 4)
 option_with_default(LIBINT2_ONEBODY_MAX_AM
-  "Support 1-body ints for Gaussians of angular momentum up to N.
+        "Support 1-body ints for Gaussians of angular momentum up to N.
    Can specify values for each derivative level as a semicolon-separated string. (default: max_am)" -1)
 option_with_default(LIBINT2_ONEBODY_OPT_AM
-  "Optimize 1-body ints maximally for up to angular momentum N (N <= max-am).
+        "Optimize 1-body ints maximally for up to angular momentum N (N <= max-am).
    Can specify values for each derivative level as a semicolon-separated string (default: (max_am/2)+1)" -1)
 
 option_with_default(LIBINT2_ERI_MAX_AM
-  "Support 4-center ERIs for Gaussians of angular momentum up to N.
+        "Support 4-center ERIs for Gaussians of angular momentum up to N.
    Can specify values for each derivative level as a semicolon-separated string. (default: max_am)" -1)
 option_with_default(LIBINT2_ERI_OPT_AM
-  "Optimize 4-center ERIs maximally for up to angular momentum N (N <= max-am).
+        "Optimize 4-center ERIs maximally for up to angular momentum N (N <= max-am).
    Can specify values for each derivative level as a semicolon-separated string (default: (max_am/2)+1)" -1)
 
+option_with_default(LIBINT2_RKB_ERI_MAX_AM
+        "Support relativistic restricted kinetic balance (RKB) 4-center ERIs for Gaussians of angular momentum up to N.
+   Can specify values for each derivative level as a semicolon-separated string. (default: max_am)" -1)
+option_with_default(LIBINT2_RKB_ERI_OPT_AM
+        "Optimize relativistic restricted kinetic balance (RKB) 4-center ERIs maximally for up to angular momentum N (N <= max-am).
+   Can specify values for each derivative level as a semicolon-separated string (default: (max_am/2)+1)" -1)
+
+
 option_with_default(LIBINT2_ERI3_MAX_AM
-  "Support 3-center ERIs for Gaussians of angular momentum up to N.
+        "Support 3-center ERIs for Gaussians of angular momentum up to N.
    Can specify values for each derivative level as a semicolon-separated string. (default: max_am)
    This option controls only the single fitting center. The paired centers use LIBINT2_MAX_AM." -1)
 option_with_default(LIBINT2_ERI3_OPT_AM
-  "Optimize 3-center ERIs maximally for up to angular momentum N (N <= max-am).
+        "Optimize 3-center ERIs maximally for up to angular momentum N (N <= max-am).
    Can specify values for each derivative level as a semicolon-separated string. (default: (max_am/2)+1)" -1)
 option_with_print(LIBINT2_ERI3_PURE_SH
-  "Assume the 'unpaired' center of 3-center ERIs will be transformed to pure solid harmonics" OFF)
+        "Assume the 'unpaired' center of 3-center ERIs will be transformed to pure solid harmonics" OFF)
 
 option_with_default(LIBINT2_ERI2_MAX_AM
-  "Support 2-center ERIs for Gaussians of angular momentum up to N.
+        "Support 2-center ERIs for Gaussians of angular momentum up to N.
     Can specify values for each derivative level as a semicolon-separated string. (default: max_am)" -1)
 option_with_default(LIBINT2_ERI2_OPT_AM
-  "Optimize 2-center ERIs maximally for up to angular momentum N (N <= max-am).
+        "Optimize 2-center ERIs maximally for up to angular momentum N (N <= max-am).
    Can specify values for each derivative level as a semicolon-separated string. (default: (max_am/2)+1)" -1)
 option_with_print(LIBINT2_ERI2_PURE_SH
-  "Assume the 2-center ERIs will be transformed to pure solid harmonics" OFF)
+        "Assume the 2-center ERIs will be transformed to pure solid harmonics" OFF)
 
 option_with_default(LIBINT2_G12_MAX_AM
-  "Support integrals for G12 methods of angular momentum up to N. (default: max_am)" -1)
+        "Support integrals for G12 methods of angular momentum up to N. (default: max_am)" -1)
 option_with_default(LIBINT2_G12_OPT_AM
-  "Optimize G12 integrals for up to angular momentum N (N <= max-am). (default: (max_am/2)+1)" -1)
+        "Optimize G12 integrals for up to angular momentum N (N <= max-am). (default: (max_am/2)+1)" -1)
 
 option_with_default(LIBINT2_G12DKH_MAX_AM
-  "Support integrals for relativistic G12 methods of angular momentum up to N. (default: max_am)" -1)
+        "Support integrals for relativistic G12 methods of angular momentum up to N. (default: max_am)" -1)
 option_with_default(LIBINT2_G12DKH_OPT_AM
-  "Optimize G12DKH integrals for up to angular momentum N (N <= max-am). (default: (max_am/2)+1)" -1)
+        "Optimize G12DKH integrals for up to angular momentum N (N <= max-am). (default: (max_am/2)+1)" -1)
 
 #  <<<  Miscellaneous  >>>
 
 option_with_print(LIBINT2_CONTRACTED_INTS
-  "Turn on support for contracted integrals." ON)
+        "Turn on support for contracted integrals." ON)
 option_with_default(LIBINT2_ERI_STRATEGY
-  "(EXPERT) Compute ERIs using the following strategy. (0 for OS, 1 for HGP, 2 for HL)" 1)
+        "(EXPERT) Compute ERIs using the following strategy. (0 for OS, 1 for HGP, 2 for HL)" 1)
 option_with_print(LIBINT2_USE_COMPOSITE_EVALUATORS
-  "Libint will use composite evaluators (i.e. every evaluator will compute one integral type only)" ON)
+        "Libint will use composite evaluators (i.e. every evaluator will compute one integral type only)" ON)
 option_with_print(LIBINT2_SINGLE_EVALTYPE
-  "Generate single evaluator type (i.e. all tasks use the same evaluator). OFF is NYI" ON)
+        "Generate single evaluator type (i.e. all tasks use the same evaluator). OFF is NYI" ON)
 option_with_default(LIBINT2_ENABLE_UNROLLING
-  "Unroll shell sets into integrals (will unroll shell sets larger than N) (0 for never, N for N, 1000000000 for always)" 100)
+        "Unroll shell sets into integrals (will unroll shell sets larger than N) (0 for never, N for N, 1000000000 for always)" 100)
 option_with_default(LIBINT2_ALIGN_SIZE
-  "(EXPERT) if posix_memalign is available, this will specify alignment of Libint data, in units of
+        "(EXPERT) if posix_memalign is available, this will specify alignment of Libint data, in units of
    sizeof(LIBINT2_REALTYPE). Default is to use built-in heuristics: system-determined for vectorization off (default) or veclen * sizeof(LIBINT2_REALTYPE) for vectorization on." 0)
 mark_as_advanced(LIBINT2_ALIGN_SIZE)
 option_with_default(LIBINT2_REALTYPE
-  "Specifies the floating-point data type used by the library. Consumed at library build-time." double)
+        "Specifies the floating-point data type used by the library. Consumed at library build-time." double)
 option_with_print(LIBINT2_USER_DEFINED_REAL_INCLUDES
-  "Additional #includes necessary to use the real type." OFF)
+        "Additional #includes necessary to use the real type." OFF)
 include(int_userreal)
 option_with_print(LIBINT2_GENERATE_FMA
-  "Generate FMA (fused multiply-add) instructions (to benefit must have FMA-capable hardware and compiler)" OFF)
+        "Generate FMA (fused multiply-add) instructions (to benefit must have FMA-capable hardware and compiler)" OFF)
 option_with_print(LIBINT2_ENABLE_GENERIC_CODE
   "Use manually-written generic code" OFF)
 option_with_default(LIBINT2_API_PREFIX
   "Prepend this string to every name in the library API (except for the types)." "")
 option_with_print(LIBINT2_VECTOR_LENGTH
-  "Compute integrals in vectors of length N." OFF)
+        "Compute integrals in vectors of length N." OFF)
 option_with_default(LIBINT2_VECTOR_METHOD
-  "Specifies how to vectorize integrals. Irrelevant when `LIBINT2_VECTOR_LENGTH=OFF. Allowed values are 'block' (default), and 'line'." block)
+        "Specifies how to vectorize integrals. Irrelevant when `LIBINT2_VECTOR_LENGTH=OFF. Allowed values are 'block' (default), and 'line'." block)
 option_with_print(LIBINT2_ACCUM_INTS
-  "Accumulate integrals to the buffer, rather than copy (OFF for copy, ON for accum)." OFF)
+        "Accumulate integrals to the buffer, rather than copy (OFF for copy, ON for accum)." OFF)
 option_with_print(LIBINT2_FLOP_COUNT
-  "Support (approximate) FLOP counting by the library. (Generated code will require C++11!)" OFF)
+        "Support (approximate) FLOP counting by the library. (Generated code will require C++11!)" OFF)
 option_with_print(LIBINT2_PROFILE
-  "Turn on profiling instrumentation of the library. (Generated code will require C++11!)" OFF)
+        "Turn on profiling instrumentation of the library. (Generated code will require C++11!)" OFF)
 option_with_print(LIBINT2_ENABLE_MPFR
-  "Use GNU MPFR library for high-precision testing (EXPERTS ONLY). Consumed at library build-time." OFF)
+        "Use GNU MPFR library for high-precision testing (EXPERTS ONLY). Consumed at library build-time." OFF)
 option_with_default(LIBINT2_EXPORT_COMPRESSOR
-  "Export tarball with compression gzip or bzip2" gzip)
+        "Export tarball with compression gzip or bzip2" gzip)
 # next one defined by `include(CTest)`
 message(STATUS "Showing option BUILD_TESTING: ${BUILD_TESTING}")
 
@@ -304,13 +315,13 @@ include(int_am)
 check_function_exists(posix_memalign HAVE_POSIX_MEMALIGN)
 if (NOT HAVE_POSIX_MEMALIGN)
     message(FATAL_ERROR "did not find posix_memalign ... this SHOULD NOT happen. Cannot proceed.")
-endif()
+endif ()
 
 check_include_file_cxx(stdint.h HAVE_STDINT_H)  # limits.h?
 
 if (cxx_std_11 IN_LIST CMAKE_CXX_COMPILE_FEATURES)
     set(LIBINT_HAS_CXX11 1)
-endif()
+endif ()
 
 booleanize01(LIBINT2_ERI3_PURE_SH)
 booleanize01(LIBINT2_ERI2_PURE_SH)
@@ -334,9 +345,9 @@ elseif (LIBINT2_EXPORT_COMPRESSOR STREQUAL "xz")
 elseif (LIBINT2_EXPORT_COMPRESSOR STREQUAL "bzip2")
     set(LIBINT_EXPORT_COMPRESSOR_CMD "jcf")
     set(LIBINT_EXPORT_COMPRESSOR_EXT "tbz2")
-else()
+else ()
     message(FATAL_ERROR "No valid compressor; invoke CMake with -DLIBINT2_EXPORT_COMPRESSOR=gzip|bzip2")
-endif()
+endif ()
 
 
 ################################## Dependencies #################################
@@ -347,9 +358,9 @@ if (LIBINT2_ENABLE_MPFR)
     # mpfr detected in CMakeLists.txt.export at appropriate time for library, but prechecking here
     find_package(Multiprecision MODULE REQUIRED COMPONENTS gmpxx mpfr)
     set(LIBINT_HAS_MPFR 1)
-else()
+else ()
     find_package(Multiprecision MODULE REQUIRED COMPONENTS gmpxx)
-endif()
+endif ()
 
 get_property(_loc TARGET Multiprecision::gmp PROPERTY LOCATION)
 message(VERBOSE "${Cyan}Found GMP${ColourReset}: ${_loc}")
@@ -358,12 +369,12 @@ message(VERBOSE "${Cyan}Found GMPXX${ColourReset}: ${_loc}")
 if (TARGET Multiprecision::mpfr)
     get_property(_loc TARGET Multiprecision::mpfr PROPERTY LOCATION)
     message(VERBOSE "${Cyan}Found MPFR${ColourReset}: ${_loc} (found version ${MPFR_VERSION})")
-endif()
+endif ()
 
 find_package(Boost 1.57 REQUIRED)
 if (TARGET Boost::headers)
     set(LIBINT_HAS_SYSTEM_BOOST_PREPROCESSOR_VARIADICS 1)
-endif()
+endif ()
 
 # deferring find_package(Eigen3) to library (CMakeLists.txt.export)
 
@@ -373,9 +384,9 @@ endif()
 set(EXPORT_STAGE_DIR ${PROJECT_BINARY_DIR}/libint-${LIBINT_EXT_VERSION})
 
 configure_file(
-  cmake/modules/int_computed.cmake.in
-  cmake/modules/int_computed.cmake
-  @ONLY)
+        cmake/modules/int_computed.cmake.in
+        cmake/modules/int_computed.cmake
+        @ONLY)
 
 # CMake data transmitted to C++ via config.h for generator/compiler (_EXPORT_MODE=0).
 #   Same info is positioned for the library export, but _EXPORT_MODE=1 turns on
@@ -383,28 +394,28 @@ configure_file(
 #   library build time.
 set(_EXPORT_MODE 0)
 # convert user-facing LIBINT2_ variables to LIBINT_ internal variables
-foreach(_var API_PREFIX;ERI3_PURE_SH;ERI2_PURE_SH;DISABLE_ONEBODY_PROPERTY_DERIVS;ENABLE_UNROLLING;ENABLE_GENERIC_CODE;VECTOR_LENGTH;VECTOR_METHOD;ALIGN_SIZE;USER_DEFINED_REAL;USER_DEFINED_REAL_INCLUDES;GENERATE_FMA;ACCUM_INTS;FLOP_COUNT;PROFILE;CONTRACTED_INTS;SINGLE_EVALTYPE;USE_COMPOSITE_EVALUATORS;ERI_STRATEGY;MULTIPOLE_MAX_ORDER)
+foreach (_var API_PREFIX;ERI3_PURE_SH;ERI2_PURE_SH;DISABLE_ONEBODY_PROPERTY_DERIVS;ENABLE_UNROLLING;ENABLE_GENERIC_CODE;VECTOR_LENGTH;VECTOR_METHOD;ALIGN_SIZE;USER_DEFINED_REAL;USER_DEFINED_REAL_INCLUDES;GENERATE_FMA;ACCUM_INTS;FLOP_COUNT;PROFILE;CONTRACTED_INTS;SINGLE_EVALTYPE;USE_COMPOSITE_EVALUATORS;ERI_STRATEGY;MULTIPOLE_MAX_ORDER)
     if (DEFINED LIBINT2_${_var})
         if (DEFINED LIBINT_${_var})
             message(FATAL_ERROR "renaming user-facing LIBINT2_${_var} variable but internal variable LIBINT_${_var} already exists")
         else ()
             set(LIBINT_${_var} ${LIBINT2_${_var}})
-        endif()
-    endif()
+        endif ()
+    endif ()
 endforeach ()
 configure_file(
-  include/libint2/config.h.cmake.in
-  include/libint2/config.h
-  @ONLY)
+        include/libint2/config.h.cmake.in
+        include/libint2/config.h
+        @ONLY)
 set(_EXPORT_MODE 1)
 configure_file(
-  include/libint2/config.h.cmake.in
-  ${EXPORT_STAGE_DIR}/include/libint2/config.h
-  @ONLY)
+        include/libint2/config.h.cmake.in
+        ${EXPORT_STAGE_DIR}/include/libint2/config.h
+        @ONLY)
 configure_file(
-  include/libint2/config2.h.cmake.in
-  ${EXPORT_STAGE_DIR}/include/libint2/config2.h.cmake.in
-  COPYONLY)
+        include/libint2/config2.h.cmake.in
+        ${EXPORT_STAGE_DIR}/include/libint2/config2.h.cmake.in
+        COPYONLY)
 
 add_subdirectory(src)
 
diff --git a/cmake/modules/int_am.cmake b/cmake/modules/int_am.cmake
index c1d61cd55..350924f49 100644
--- a/cmake/modules/int_am.cmake
+++ b/cmake/modules/int_am.cmake
@@ -262,7 +262,7 @@ macro(process_integrals_class class)
             if (LIBINT2_${class}_OPT_AM EQUAL -1)
                 set(LIBINT_${class}_OPT_AM "")
             else()
-                set($LIBINT_{class}_OPT_AM ${LIBINT2_${class}_OPT_AM})
+                set(LIBINT_${class}_OPT_AM ${LIBINT2_${class}_OPT_AM})
             endif()
         endif()
         if (LIBINT_OPT_AM_LIST)
@@ -357,6 +357,7 @@ endmacro()
 
 process_integrals_class(ONEBODY)
 process_integrals_class(ERI)
+process_integrals_class(RKB_ERI)
 process_integrals_class(ERI3)
 process_integrals_class(ERI2)
 # unlike above, these classes (1) don't do AM_LIST and (2) require value in config.h if enabled
@@ -396,7 +397,7 @@ list(REVERSE _amlist)
 list(APPEND Libint2_ERI_COMPONENTS "${_amlist}")
 message(VERBOSE "setting components ${_amlist}")
 
-foreach(_cls ONEBODY;ERI;ERI3;ERI2;G12;G12DKH)
+foreach(_cls ONEBODY;ERI;RKB_ERI;ERI3;ERI2;G12;G12DKH)
     if((_cls STREQUAL G12) OR (_cls STREQUAL G12DKH))
         add_feature_info(
           "integral class ${_cls}"
@@ -433,6 +434,8 @@ foreach(_cls ONEBODY;ERI;ERI3;ERI2;G12;G12DKH)
                     list(APPEND _amlist         "onebody_${_am${_l}}${_am${_l}}_d${_d}")
                 elseif (_cls STREQUAL "G12")
                     list(APPEND _amlist             "g12_${_am${_l}}${_am${_l}}${_am${_l}}${_am${_l}}_d${_d}")
+                elseif (_cls STREQUAL "RKB_ERI")
+                    list(APPEND _amlist             "rkb_eri_${_am${_l}}${_am${_l}}${_am${_l}}${_am${_l}}_d${_d}")
                 endif()
             endforeach()
             if (_cls STREQUAL "ERI3")
diff --git a/export/tests/unit/test-1body.cc b/export/tests/unit/test-1body.cc
index 8c27be49f..5d0458b60 100644
--- a/export/tests/unit/test-1body.cc
+++ b/export/tests/unit/test-1body.cc
@@ -244,6 +244,73 @@ TEST_CASE_METHOD(libint2::unit::DefaultFixture, "W correctness",
 #endif  // LIBINT2_SUPPORT_ONEBODY
 }
 
+TEST_CASE_METHOD(libint2::unit::DefaultFixture, "σpRσp correctness",
+                 "[engine][1-body]") {
+#if defined(LIBINT2_SUPPORT_ONEBODY)
+  if (LIBINT_SHGSHELL_ORDERING != LIBINT_SHGSHELL_ORDERING_STANDARD) return;
+
+  // Two contracted Gaussian shells at distinct centers. We exercise
+  // `(σ·p) r (σ·p)` over the (s|d) and (d|s) shell pairs and validate
+  // Hermiticity: trace components (q=0) are symmetric under bra↔ket swap,
+  // antisym components (q=1,2,3) are antisymmetric.
+  std::vector<Shell> obs{
+      Shell{{1.0, 3.0}, {{0, true, {1.0, 0.3}}}, {{0.0, 0.0, 0.0}}},
+      Shell{{2.0, 5.0}, {{2, true, {1.0, 0.2}}}, {{1.0, 1.0, 1.0}}}};
+
+  const auto lmax = std::min(2, LIBINT2_MAX_AM_oprop);
+  if (lmax < 2) return;
+
+  auto engine = Engine(Operator::oprop, 2, lmax);
+  engine.set_params(std::array<double, 3>{{0.0, 0.0, 0.0}});
+
+  // (s|σpRσp|d) and (d|σpRσp|s)
+  engine.compute(obs[0], obs[1]);
+  std::array<std::vector<double>, 12> ab;
+  for (int c = 0; c < 12; ++c) {
+    const auto* buf = engine.results()[c];
+    REQUIRE(buf != nullptr);
+    ab[c].assign(buf, buf + (1 * 5));  // n_s × n_d_pure = 1 × 5
+  }
+
+  engine.compute(obs[1], obs[0]);
+  std::array<std::vector<double>, 12> ba;
+  for (int c = 0; c < 12; ++c) {
+    const auto* buf = engine.results()[c];
+    REQUIRE(buf != nullptr);
+    ba[c].assign(buf, buf + (5 * 1));  // n_d_pure × n_s
+  }
+
+  // Hermiticity check: σpRσp is Hermitian, but the Pauli identity routes the
+  // imaginary i factor on the antisym pieces into a real-stored sign flip.
+  //   q=0 trace: matrix is symmetric  ⇒  ab[0+k][i,j] ==  ba[0+k][j,i]
+  //   q=1..3   : matrix is antisym    ⇒  ab[q+k][i,j] == -ba[q+k][j,i]
+  // (Indices: ab is laid out (i=0..n_s-1, j=0..n_d-1); ba is (j, i).)
+  const double tol = 1.0e-12;
+  for (int k = 0; k < 3; ++k) {
+    for (int q = 0; q < 4; ++q) {
+      const int comp = 4 * k + q;
+      const double expected_sign = (q == 0) ? +1.0 : -1.0;
+      for (int i = 0; i < 1; ++i) {
+        for (int j = 0; j < 5; ++j) {
+          const double v_ab = ab[comp][i * 5 + j];
+          const double v_ba = ba[comp][j * 1 + i];
+          REQUIRE(std::isfinite(v_ab));
+          REQUIRE(std::abs(v_ab - expected_sign * v_ba) < tol);
+        }
+      }
+    }
+  }
+
+  // Sanity: not every component is identically zero (would mask codegen bugs).
+  bool any_nonzero = false;
+  for (int c = 0; c < 12; ++c)
+    for (double v : ab[c])
+      if (std::abs(v) > 1.0e-10) any_nonzero = true;
+  REQUIRE(any_nonzero);
+#endif  // LIBINT2_SUPPORT_ONEBODY
+}
+
+
 // Helper: max number of primitives across all centers in q_gau data.
 static size_t max_nprim(const libint2::GaussianPotentialCentersData& data) {
   size_t result = 0;
diff --git a/export/tests/unit/test-2body.cc b/export/tests/unit/test-2body.cc
index fd602a910..dc0c01b73 100644
--- a/export/tests/unit/test-2body.cc
+++ b/export/tests/unit/test-2body.cc
@@ -1,5 +1,5 @@
 /*
- *  Copyright (C) 2004-2024 Edward F. Valeev
+ *  Copyright (C) 2004-2026 Edward F. Valeev
  *
  *  This file is part of Libint library.
  *
@@ -344,6 +344,438 @@ TEST_CASE("eri geometric derivatives", "[engine][2-body]") {
   }
 }
 
+TEST_CASE("RKB Coulomb integrals", "[engine][2-body]") {
+  std::vector<Shell> obs{// pseudorandom s
+                         Shell{{1.0}, {{0, false, {1.0}}}, {{0.0, 0.0, 0.0}}},
+                         // pseudorandom p
+                         Shell{{2.0}, {{1, false, {1.0}}}, {{1.0, 1.0, 1.0}}}};
+
+  const auto max_nprim = libint2::max_nprim(obs);
+  const auto max_l = libint2::max_l(obs);
+  typedef std::array<unsigned int, 12> der_idx;
+
+  // e.g. d_xx maps the derivative index of derivative w.r.t x
+  // coord of ket1 and x coord of ket2 in Chemist notation.
+  // deriv indices for (LL|SS)
+  der_idx d_xx = {0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0};
+  der_idx d_yy = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0};
+  der_idx d_zz = {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1};
+  der_idx d_yz = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1};
+  der_idx d_zy = {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0};
+  der_idx d_zx = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0};
+  der_idx d_xz = {0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1};
+  der_idx d_xy = {0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0};
+  der_idx d_yx = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0};
+
+  // deriv indices for (SS|SS)
+  // 0th component
+  der_idx xxxx = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
+  der_idx yyxx = {0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0};
+  der_idx zzxx = {0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0};
+  der_idx yxyx = {0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0};
+  der_idx xyyx = {1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0};
+  der_idx yxxy = {0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0};
+  der_idx xyxy = {1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0};
+  der_idx xxyy = {1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0};
+  der_idx yyyy = {0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0};
+  der_idx zzyy = {0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0};
+  der_idx xxzz = {1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1};
+  der_idx yyzz = {0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1};
+  der_idx zzzz = {0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1};
+
+  // x-component
+  der_idx zxzx = {0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0};
+  der_idx xzzx = {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0};
+  der_idx zyzy = {0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0};
+  der_idx yzzy = {0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0};
+  der_idx zxxz = {0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1};
+  der_idx xzxz = {1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1};
+  der_idx zyyz = {0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1};
+  der_idx yzyz = {0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1};
+
+  // y-component
+  der_idx zyzx = {0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0};
+  der_idx yzzx = {0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0};
+  der_idx zxzy = {0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0};
+  der_idx xzzy = {1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0};
+  der_idx zyxz = {0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1};
+  der_idx yzxz = {0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1};
+  der_idx zxyz = {0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1};
+  der_idx xzyz = {1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1};
+
+  // z-component
+  der_idx yxxx = {0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
+  der_idx xyxx = {1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0};
+  der_idx xxyx = {1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0};
+  der_idx yyyx = {0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0};
+  der_idx zzyx = {0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0};
+  der_idx xxxy = {1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0};
+  der_idx yyxy = {0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0};
+  der_idx zzxy = {0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0};
+  der_idx yxyy = {0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0};
+  der_idx xyyy = {1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0};
+  der_idx yxzz = {0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1};
+  der_idx xyzz = {1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1};
+
+  SECTION("Coulombσpσp and σpσpCoulombσpσp") {
+    Engine engine_llss, engine_ssss;
+    try {
+      engine_llss = Engine(Operator::coulomb_opop, max_nprim, max_l, 0);
+      engine_ssss = Engine(Operator::opop_coulomb_opop, max_nprim, max_l, 0);
+      // TODO: need another unit test for derivatives of RKB ERIs
+    } catch (
+        Engine::lmax_exceeded &) {  // skip the test if lmax exceeded or libint2
+                                    // not configured with RKB support
+      return;
+    }
+
+    const auto nshell = obs.size();
+    for (int s0 = 0; s0 != nshell; ++s0) {
+      for (int s1 = 0; s1 != nshell; ++s1) {
+        for (int s2 = 0; s2 != nshell; ++s2) {
+          for (int s3 = 0; s3 != nshell; ++s3) {
+            const auto &results_llss =
+                engine_llss.compute(obs[s0], obs[s1], obs[s2], obs[s3]);
+            const auto &results_ssss =
+                engine_ssss.compute(obs[s0], obs[s1], obs[s2], obs[s3]);
+            assert(results_llss.size() ==
+                   4);  // 4 buffers for single-spin quaternion components
+
+            LIBINT2_REF_REALTYPE Aref[3];
+            for (int i = 0; i < 3; ++i) Aref[i] = obs[s0].O[i];
+            LIBINT2_REF_REALTYPE Bref[3];
+            for (int i = 0; i < 3; ++i) Bref[i] = obs[s1].O[i];
+            LIBINT2_REF_REALTYPE Cref[3];
+            for (int i = 0; i < 3; ++i) Cref[i] = obs[s2].O[i];
+            LIBINT2_REF_REALTYPE Dref[3];
+            for (int i = 0; i < 3; ++i) Dref[i] = obs[s3].O[i];
+
+            int ijkl = 0;
+
+            int l0, m0, n0;
+            FOR_CART(l0, m0, n0, obs[s0].contr[0].l)
+
+            int l1, m1, n1;
+            FOR_CART(l1, m1, n1, obs[s1].contr[0].l)
+
+            int l2, m2, n2;
+            FOR_CART(l2, m2, n2, obs[s2].contr[0].l)
+
+            int l3, m3, n3;
+            FOR_CART(l3, m3, n3, obs[s3].contr[0].l)
+
+            std::array<LIBINT2_REF_REALTYPE, 4> ref_coulomb_opop{0.0, 0.0, 0.0,
+                                                                 0.0};
+            std::array<LIBINT2_REF_REALTYPE, 16> ref_opop_coulomb_opop{};
+            ref_opop_coulomb_opop.fill(0.0);
+            uint p0123 = 0;
+            for (uint p0 = 0; p0 < obs[s0].nprim(); p0++) {
+              for (uint p1 = 0; p1 < obs[s1].nprim(); p1++) {
+                for (uint p2 = 0; p2 < obs[s2].nprim(); p2++) {
+                  for (uint p3 = 0; p3 < obs[s3].nprim(); p3++, p0123++) {
+                    const LIBINT2_REF_REALTYPE alpha0 = obs[s0].alpha[p0];
+                    const LIBINT2_REF_REALTYPE alpha1 = obs[s1].alpha[p1];
+                    const LIBINT2_REF_REALTYPE alpha2 = obs[s2].alpha[p2];
+                    const LIBINT2_REF_REALTYPE alpha3 = obs[s3].alpha[p3];
+
+                    const LIBINT2_REF_REALTYPE c0 = obs[s0].contr[0].coeff[p0];
+                    const LIBINT2_REF_REALTYPE c1 = obs[s1].contr[0].coeff[p1];
+                    const LIBINT2_REF_REALTYPE c2 = obs[s2].contr[0].coeff[p2];
+                    const LIBINT2_REF_REALTYPE c3 = obs[s3].contr[0].coeff[p3];
+                    const LIBINT2_REF_REALTYPE c0123 = c0 * c1 * c2 * c3;
+
+                    auto eri_drrrr = [&](der_idx d_rrrr) {
+                      return eri(d_rrrr.data(), l0, m0, n0, alpha0, Aref, l1,
+                                 m1, n1, alpha1, Bref, l2, m2, n2, alpha2, Cref,
+                                 l3, m3, n3, alpha3, Dref, 0);
+                    };
+
+                    // helper: build der_idx from 4 derivative directions
+                    // (0=x, 1=y, 2=z) for centers A, B, C, D
+                    constexpr int X = 0, Y = 1, Z = 2;
+                    auto didx = [](int a, int b, int c, int d) -> der_idx {
+                      der_idx r = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+                      r[a] = 1;
+                      r[3 + b] = 1;
+                      r[6 + c] = 1;
+                      r[9 + d] = 1;
+                      return r;
+                    };
+                    // shorthand: evaluate derivative ERI from 4 directions
+                    auto D = [&](int a, int b, int c, int d) {
+                      return eri_drrrr(didx(a, b, c, d));
+                    };
+
+                    // (LL|SS)
+                    ref_coulomb_opop[0] +=
+                        c0123 *
+                        (eri_drrrr(d_xx) + eri_drrrr(d_yy) + eri_drrrr(d_zz));
+                    ref_coulomb_opop[1] +=
+                        c0123 * (eri_drrrr(d_yz) - eri_drrrr(d_zy));
+                    ref_coulomb_opop[2] +=
+                        c0123 * (eri_drrrr(d_zx) - eri_drrrr(d_xz));
+                    ref_coulomb_opop[3] +=
+                        c0123 * (eri_drrrr(d_xy) - eri_drrrr(d_yx));
+
+                    // (SS|SS) — 16 components, Option A: index = 4*bra + ket
+                    // 0:SS  1:SX  2:SY  3:SZ
+                    ref_opop_coulomb_opop[0] +=
+                        c0123 * (D(X, X, X, X) + D(X, X, Y, Y) + D(X, X, Z, Z) +
+                                 D(Y, Y, X, X) + D(Y, Y, Y, Y) + D(Y, Y, Z, Z) +
+                                 D(Z, Z, X, X) + D(Z, Z, Y, Y) + D(Z, Z, Z, Z));
+                    ref_opop_coulomb_opop[1] +=
+                        c0123 * (D(X, X, Y, Z) - D(X, X, Z, Y) + D(Y, Y, Y, Z) -
+                                 D(Y, Y, Z, Y) + D(Z, Z, Y, Z) - D(Z, Z, Z, Y));
+                    ref_opop_coulomb_opop[2] +=
+                        c0123 * (D(X, X, Z, X) - D(X, X, X, Z) + D(Y, Y, Z, X) -
+                                 D(Y, Y, X, Z) + D(Z, Z, Z, X) - D(Z, Z, X, Z));
+                    ref_opop_coulomb_opop[3] +=
+                        c0123 * (D(X, X, X, Y) - D(X, X, Y, X) + D(Y, Y, X, Y) -
+                                 D(Y, Y, Y, X) + D(Z, Z, X, Y) - D(Z, Z, Y, X));
+                    // 4:XS  5:XX  6:XY  7:XZ
+                    ref_opop_coulomb_opop[4] +=
+                        c0123 * (D(Y, Z, X, X) - D(Z, Y, X, X) + D(Y, Z, Y, Y) -
+                                 D(Z, Y, Y, Y) + D(Y, Z, Z, Z) - D(Z, Y, Z, Z));
+                    ref_opop_coulomb_opop[5] +=
+                        c0123 * (-D(Y, Z, Y, Z) + D(Y, Z, Z, Y) +
+                                 D(Z, Y, Y, Z) - D(Z, Y, Z, Y));
+                    ref_opop_coulomb_opop[6] +=
+                        c0123 * (-D(Y, Z, Z, X) + D(Y, Z, X, Z) +
+                                 D(Z, Y, Z, X) - D(Z, Y, X, Z));
+                    ref_opop_coulomb_opop[7] +=
+                        c0123 * (-D(Y, Z, X, Y) + D(Y, Z, Y, X) +
+                                 D(Z, Y, X, Y) - D(Z, Y, Y, X));
+                    // 8:YS  9:YX  10:YY  11:YZ
+                    ref_opop_coulomb_opop[8] +=
+                        c0123 * (D(Z, X, X, X) - D(X, Z, X, X) + D(Z, X, Y, Y) -
+                                 D(X, Z, Y, Y) + D(Z, X, Z, Z) - D(X, Z, Z, Z));
+                    ref_opop_coulomb_opop[9] +=
+                        c0123 * (-D(Z, X, Y, Z) + D(Z, X, Z, Y) +
+                                 D(X, Z, Y, Z) - D(X, Z, Z, Y));
+                    ref_opop_coulomb_opop[10] +=
+                        c0123 * (-D(Z, X, Z, X) + D(Z, X, X, Z) +
+                                 D(X, Z, Z, X) - D(X, Z, X, Z));
+                    ref_opop_coulomb_opop[11] +=
+                        c0123 * (-D(Z, X, X, Y) + D(Z, X, Y, X) +
+                                 D(X, Z, X, Y) - D(X, Z, Y, X));
+                    // 12:ZS  13:ZX  14:ZY  15:ZZ
+                    ref_opop_coulomb_opop[12] +=
+                        c0123 * (D(X, Y, X, X) - D(Y, X, X, X) + D(X, Y, Y, Y) -
+                                 D(Y, X, Y, Y) + D(X, Y, Z, Z) - D(Y, X, Z, Z));
+                    ref_opop_coulomb_opop[13] +=
+                        c0123 * (-D(X, Y, Y, Z) + D(X, Y, Z, Y) +
+                                 D(Y, X, Y, Z) - D(Y, X, Z, Y));
+                    ref_opop_coulomb_opop[14] +=
+                        c0123 * (-D(X, Y, Z, X) + D(X, Y, X, Z) +
+                                 D(Y, X, Z, X) - D(Y, X, X, Z));
+                    ref_opop_coulomb_opop[15] +=
+                        c0123 * (-D(X, Y, X, Y) + D(X, Y, Y, X) +
+                                 D(Y, X, X, Y) - D(Y, X, Y, X));
+                  }
+                }
+              }
+            }
+
+            const double ABSOLUTE_DEVIATION_THRESHOLD = 5.0E-14;
+            const double RELATIVE_DEVIATION_THRESHOLD =
+                1.0E-9;  // For more detail on choice of these thresholds, see
+                         // the comments in the TEST_CASE "eri geometric
+                         // derivatives"
+
+            std::array<LIBINT2_REF_REALTYPE, 4> abs_errs_llss;
+            std::array<LIBINT2_REF_REALTYPE, 4> rel_abs_errs_llss;
+
+            // (LL|SS) has 4 components
+            for (auto comp = 0; comp < 4; ++comp) {
+              abs_errs_llss[comp] =
+                  abs(ref_coulomb_opop[comp] - results_llss[comp][ijkl]);
+              rel_abs_errs_llss[comp] =
+                  abs(abs_errs_llss[comp] / ref_coulomb_opop[comp]);
+
+              bool llss_not_ok =
+                  rel_abs_errs_llss[comp] > RELATIVE_DEVIATION_THRESHOLD &&
+                  abs_errs_llss[comp] > ABSOLUTE_DEVIATION_THRESHOLD;
+
+              if (llss_not_ok) {
+                std::cout << "(l0 l1| l2 l3) = "
+                          << "(" << s0 << " " << s1 << " | " << s2 << " " << s3
+                          << ") "
+                          << "Elem " << ijkl << " comp= " << comp
+                          << " : ref = " << ref_coulomb_opop[comp]
+                          << " libint = " << results_llss[comp][ijkl]
+                          << " relabs_error = " << rel_abs_errs_llss[comp]
+                          << " abs_error = " << abs_errs_llss[comp]
+                          << std::endl;
+              }
+              REQUIRE(!llss_not_ok);
+            }
+
+            // (SS|SS) has 16 components (two independent spin spaces)
+            for (auto comp = 0; comp < 16; ++comp) {
+              auto abs_err_ssss =
+                  abs(ref_opop_coulomb_opop[comp] - results_ssss[comp][ijkl]);
+              auto rel_abs_err_ssss =
+                  abs(abs_err_ssss / ref_opop_coulomb_opop[comp]);
+
+              bool ssss_not_ok =
+                  rel_abs_err_ssss > RELATIVE_DEVIATION_THRESHOLD &&
+                  abs_err_ssss > ABSOLUTE_DEVIATION_THRESHOLD;
+
+              if (ssss_not_ok) {
+                std::cout << "(l0 l1| l2 l3) = "
+                          << "(" << s0 << " " << s1 << " | " << s2 << " " << s3
+                          << ") "
+                          << "Elem " << ijkl << " comp= " << comp
+                          << " : ref = " << ref_opop_coulomb_opop[comp]
+                          << " libint = " << results_ssss[comp][ijkl]
+                          << " relabs_error = " << rel_abs_err_ssss
+                          << " abs_error = " << abs_err_ssss << std::endl;
+              }
+              REQUIRE(!ssss_not_ok);
+            }
+
+            ++ijkl;
+            END_FOR_CART
+            END_FOR_CART
+            END_FOR_CART
+            END_FOR_CART
+          }
+        }
+      }
+    }
+  }
+
+  SECTION("op_coulomb_op") {
+    Engine engine_opCop;
+    try {
+      engine_opCop = Engine(Operator::op_coulomb_op, max_nprim, max_l, 0);
+    } catch (Engine::lmax_exceeded &) {
+      return;
+    }
+
+    const auto nshell = obs.size();
+    for (int s0 = 0; s0 != nshell; ++s0) {
+      for (int s1 = 0; s1 != nshell; ++s1) {
+        for (int s2 = 0; s2 != nshell; ++s2) {
+          for (int s3 = 0; s3 != nshell; ++s3) {
+            const auto &results =
+                engine_opCop.compute(obs[s0], obs[s1], obs[s2], obs[s3]);
+            assert(results.size() == 9);
+
+            LIBINT2_REF_REALTYPE Aref[3], Bref[3], Cref[3], Dref[3];
+            for (int i = 0; i < 3; ++i) Aref[i] = obs[s0].O[i];
+            for (int i = 0; i < 3; ++i) Bref[i] = obs[s1].O[i];
+            for (int i = 0; i < 3; ++i) Cref[i] = obs[s2].O[i];
+            for (int i = 0; i < 3; ++i) Dref[i] = obs[s3].O[i];
+
+            int ijkl = 0;
+
+            int l0, m0, n0;
+            FOR_CART(l0, m0, n0, obs[s0].contr[0].l)
+            int l1, m1, n1;
+            FOR_CART(l1, m1, n1, obs[s1].contr[0].l)
+            int l2, m2, n2;
+            FOR_CART(l2, m2, n2, obs[s2].contr[0].l)
+            int l3, m3, n3;
+            FOR_CART(l3, m3, n3, obs[s3].contr[0].l)
+
+            // Raw 3x3 dyadic T_{ab} = (a b∂_a | c d∂_b), accumulated in
+            // chemist-notation index 3*a+b. After the primitive loop we
+            // project into the 9 SO(3) irreducible components that the engine
+            // returns: Scalar, AntisymX/Y/Z, SymTLDiagA/B, SymTLOffXY/XZ/YZ.
+            std::array<LIBINT2_REF_REALTYPE, 9> ref_raw{};
+            ref_raw.fill(0.0);
+
+            for (uint p0 = 0; p0 < obs[s0].nprim(); p0++) {
+              for (uint p1 = 0; p1 < obs[s1].nprim(); p1++) {
+                for (uint p2 = 0; p2 < obs[s2].nprim(); p2++) {
+                  for (uint p3 = 0; p3 < obs[s3].nprim(); p3++) {
+                    const LIBINT2_REF_REALTYPE alpha0 = obs[s0].alpha[p0];
+                    const LIBINT2_REF_REALTYPE alpha1 = obs[s1].alpha[p1];
+                    const LIBINT2_REF_REALTYPE alpha2 = obs[s2].alpha[p2];
+                    const LIBINT2_REF_REALTYPE alpha3 = obs[s3].alpha[p3];
+                    const LIBINT2_REF_REALTYPE c0 = obs[s0].contr[0].coeff[p0];
+                    const LIBINT2_REF_REALTYPE c1 = obs[s1].contr[0].coeff[p1];
+                    const LIBINT2_REF_REALTYPE c2 = obs[s2].contr[0].coeff[p2];
+                    const LIBINT2_REF_REALTYPE c3 = obs[s3].contr[0].coeff[p3];
+                    const LIBINT2_REF_REALTYPE c0123 = c0 * c1 * c2 * c3;
+
+                    // Deriv on ν (center B, index 1) and on λ (center D, idx
+                    // 3).
+                    auto didx_bd = [](int a, int b) -> der_idx {
+                      der_idx r = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+                      r[3 + a] = 1;
+                      r[9 + b] = 1;
+                      return r;
+                    };
+                    auto D = [&](int a, int b) {
+                      auto di = didx_bd(a, b);
+                      return eri(di.data(), l0, m0, n0, alpha0, Aref, l1, m1,
+                                 n1, alpha1, Bref, l2, m2, n2, alpha2, Cref, l3,
+                                 m3, n3, alpha3, Dref, 0);
+                    };
+                    for (int a = 0; a < 3; ++a)
+                      for (int b = 0; b < 3; ++b)
+                        ref_raw[3 * a + b] += c0123 * D(a, b);
+                  }
+                }
+              }
+            }
+
+            // Project raw dyadic into the 9 SO(3) irrep components used by
+            // op_coulomb_op (must match σpCoulombσp_Descr::Component order).
+            const auto Txx = ref_raw[0];
+            const auto Txy = ref_raw[1];
+            const auto Txz = ref_raw[2];
+            const auto Tyx = ref_raw[3];
+            const auto Tyy = ref_raw[4];
+            const auto Tyz = ref_raw[5];
+            const auto Tzx = ref_raw[6];
+            const auto Tzy = ref_raw[7];
+            const auto Tzz = ref_raw[8];
+            std::array<LIBINT2_REF_REALTYPE, 9> ref_op_coulomb_op{
+                Txx + Tyy + Tzz,        // Scalar
+                Tyz - Tzy,              // AntisymX
+                Tzx - Txz,              // AntisymY
+                Txy - Tyx,              // AntisymZ
+                Txx - Tyy,              // SymTLDiagA
+                2.0 * Tzz - Txx - Tyy,  // SymTLDiagB
+                Txy + Tyx,              // SymTLOffXY
+                Txz + Tzx,              // SymTLOffXZ
+                Tyz + Tzy,              // SymTLOffYZ
+            };
+
+            const double ABSOLUTE_DEVIATION_THRESHOLD = 5.0E-14;
+            const double RELATIVE_DEVIATION_THRESHOLD = 1.0E-9;
+            for (auto comp = 0; comp < 9; ++comp) {
+              auto abs_err = abs(ref_op_coulomb_op[comp] - results[comp][ijkl]);
+              auto rel_abs_err = abs(abs_err / ref_op_coulomb_op[comp]);
+              bool not_ok = rel_abs_err > RELATIVE_DEVIATION_THRESHOLD &&
+                            abs_err > ABSOLUTE_DEVIATION_THRESHOLD;
+              if (not_ok) {
+                std::cout << "(l0 l1| l2 l3) = (" << s0 << " " << s1 << " | "
+                          << s2 << " " << s3 << ") Elem " << ijkl
+                          << " comp= " << comp
+                          << " : ref = " << ref_op_coulomb_op[comp]
+                          << " libint = " << results[comp][ijkl]
+                          << " relabs_error = " << rel_abs_err
+                          << " abs_error = " << abs_err << std::endl;
+              }
+              REQUIRE(!not_ok);
+            }
+
+            ++ijkl;
+            END_FOR_CART
+            END_FOR_CART
+            END_FOR_CART
+            END_FOR_CART
+          }
+        }
+      }
+    }
+  }
+}
+
 TEST_CASE("Erfx_Coulomb integrals", "[engine][2-body]") {
   // pseudorandom s shells
   std::vector<Shell> obs{
@@ -374,12 +806,12 @@ TEST_CASE("Erfx_Coulomb integrals", "[engine][2-body]") {
       REQUIRE(results[0] != nullptr);
       switch (k) {
         /* VALIDATION WOLFRAM CODE:
-(* Integral of Coulomb kernel damped by (\[Lambda] Erf[\[Omega] r] + \
-\[Sigma] Erfc[\[Omega] r]), over unit-normalized s functions, \
-see Eq 52 in DOI 10.1039/b605188j *)
-F0[T_] := If[T == 0, 1, Sqrt[\[Pi]/T]*Erf[Sqrt[T]]/2];
-sN[a_] := ((2 a)/\[Pi])^(3/4);
-VVeeErfx[\[Alpha]1_, A1_List, \[Alpha]2_, A2_List, \[Beta]1_,
+ (* Integral of Coulomb kernel damped by (\[Lambda] Erf[\[Omega] r] + \
+ \[Sigma] Erfc[\[Omega] r]), over unit-normalized s functions, \
+ see Eq 52 in DOI 10.1039/b605188j *)
+ F0[T_] := If[T == 0, 1, Sqrt[\[Pi]/T]*Erf[Sqrt[T]]/2];
+ sN[a_] := ((2 a)/\[Pi])^(3/4);
+ VVeeErfx[\[Alpha]1_, A1_List, \[Alpha]2_, A2_List, \[Beta]1_,
    B1_List, \[Beta]2_, B2_List, \[Omega]_, \[Lambda]_, \[Sigma]_] :=
   Module[{\[Gamma]1, \[Gamma]2, P1, P2, K1, K2, T, result, \[Rho]},
    \[Gamma]1 = \[Alpha]1 + \[Beta]1;
@@ -397,13 +829,13 @@ VVeeErfx[\[Alpha]1_, A1_List, \[Alpha]2_, A2_List, \[Beta]1_,
            T]) sN[\[Alpha]1] sN[\[Alpha]2] sN[\[Beta]1] sN[\[Beta]2];
    Return[result];
    ];
-Print[CForm[
+ Print[CForm[
   N[VVeeErfx[1, {0, 0, 0},  3, {2, 2, 2}, 2, {1, 1, 1}, 4, {3, 3, 3},
     1.1, 1, 0], 20]]]
-Print[CForm[
+ Print[CForm[
   N[VVeeErfx[1, {0, 0, 0},  3, {2, 2, 2}, 2, {1, 1, 1}, 4, {3, 3, 3},
     1.1, 0, 1], 20]]]
-Print[CForm[
+ Print[CForm[
   N[VVeeErfx[1, {0, 0, 0},  3, {2, 2, 2}, 2, {1, 1, 1}, 4, {3, 3, 3},
     1.1, 2, 3], 20]]]
          */
diff --git a/include/libint2.h b/include/libint2.h
index 59d17ef65..25a231be7 100644
--- a/include/libint2.h
+++ b/include/libint2.h
@@ -22,18 +22,19 @@
 #define _libint2_header_
 
 #define LIBINT_T_SS_EREP_SS(mValue) \
-  _aB_s___0__s___1___TwoPRep_s___0__s___1___Ab__up_##mValue
+  _aB_s____0__s____1___TwoPRep_s____0__s____1___Ab__up_##mValue
 #define LIBINT_T_SS_Km1G12_SS(mValue) \
-  _aB_s___0__s___1___r12_minus_1_g12_s___0__s___1___Ab__up_##mValue
+  _aB_s____0__s____1___r12_minus_1_g12_s____0__s____1___Ab__up_##mValue
 #define LIBINT_T_SS_K0G12_SS_0 \
-  _aB_s___0__s___1___r12_0_g12_s___0__s___1___Ab__up_0
+  _aB_s____0__s____1___r12_0_g12_s____0__s____1___Ab__up_0
 #define LIBINT_T_SS_K2G12_SS_0 \
-  _aB_s___0__s___1___r12_2_g12_s___0__s___1___Ab__up_0
+  _aB_s____0__s____1___r12_2_g12_s____0__s____1___Ab__up_0
 #define LIBINT_T_SS_K4G12_SS_0 \
-  _aB_s___0__s___1___r12_4_g12_s___0__s___1___Ab__up_0
-#define LIBINT_T_S_OVERLAP_S _aB_s___0___Overlap_s___0___Ab__up_
-#define LIBINT_T_S_KINETIC_S _aB_s___0___Kinetic_s___0___Ab__up_
-#define LIBINT_T_S_ELECPOT_S(mValue) _aB_s___0___ElecPot_s___0___Ab__up_##mValue
+  _aB_s____0__s____1___r12_4_g12_s____0__s____1___Ab__up_0
+#define LIBINT_T_S_OVERLAP_S _aB_s____0___Overlap_s____0___Ab__up_
+#define LIBINT_T_S_KINETIC_S _aB_s____0___Kinetic_s____0___Ab__up_
+#define LIBINT_T_S_ELECPOT_S(mValue) \
+  _aB_s____0___ElecPot_s____0___Ab__up_##mValue
 
 #include <libint2/util/configuration.h>
 #include <libint2/util/generated/libint2_params.h>
diff --git a/include/libint2/config.h.cmake.in b/include/libint2/config.h.cmake.in
index 640e68099..3eb32e2f0 100644
--- a/include/libint2/config.h.cmake.in
+++ b/include/libint2/config.h.cmake.in
@@ -71,6 +71,13 @@
 #undef LIBINT_INCLUDE_ERI
 #endif
 
+/* Support ERI derivatives up to this order */
+#define LIBINT_INCLUDE_RKB_ERI @LIBINT_INCLUDE_RKB_ERI@
+#if @LIBINT_INCLUDE_RKB_ERI@ == -1
+#undef LIBINT_INCLUDE_RKB_ERI
+#endif
+
+
 /* Support 3-center ERI derivatives up to this order */
 #define LIBINT_INCLUDE_ERI3 @LIBINT_INCLUDE_ERI3@
 #if @LIBINT_INCLUDE_ERI3@ == -1
@@ -122,6 +129,18 @@
 /* Max optimized AM for ERI and its derivatives */
 #cmakedefine LIBINT_ERI_OPT_AM_LIST "@LIBINT_ERI_OPT_AM_LIST@"
 
+/* Max AM for RKB_ERI (same for all derivatives; if not defined see LIBINT_RKB_ERI_MAX_AM_LIST) */
+#cmakedefine LIBINT_RKB_ERI_MAX_AM @LIBINT_RKB_ERI_MAX_AM@
+
+/* Max AM for RKB_ERI and its derivatives */
+#cmakedefine LIBINT_RKB_ERI_MAX_AM_LIST "@LIBINT_RKB_ERI_MAX_AM_LIST@"
+
+/* Max optimized AM for ERI (same for all derivatives; if not defined see LIBINT_RKB_ERI_OPT_AM_LIST) */
+#cmakedefine LIBINT_RKB_ERI_OPT_AM @LIBINT_RKB_ERI_OPT_AM@
+
+/* Max optimized AM for ERI and its derivatives */
+#cmakedefine LIBINT_RKB_ERI_OPT_AM_LIST "@LIBINT_RKB_ERI_OPT_AM_LIST@"
+
 /* Max AM for 3-center ERI (same for all derivatives; if not defined see LIBINT_ERI3_MAX_AM_LIST) */
 #cmakedefine LIBINT_ERI3_MAX_AM @LIBINT_ERI3_MAX_AM@
 
diff --git a/include/libint2/cxxapi.h b/include/libint2/cxxapi.h
index 0aceb509b..22686f958 100644
--- a/include/libint2/cxxapi.h
+++ b/include/libint2/cxxapi.h
@@ -35,9 +35,9 @@
 
 #if !defined(LIBINT_INCLUDE_ONEBODY) ||                              \
     !(defined(LIBINT_INCLUDE_ERI) || defined(LIBINT_INCLUDE_ERI3) || \
-      defined(LIBINT_INCLUDE_ERI2))
+      defined(LIBINT_INCLUDE_ERI2) || defined(LIBINT_INCLUDE_RKB_ERI))
 #error \
-    "C++ API is only supported if both 1-body and some (eri, eri3, eri2) 2-body integrals are enabled"
+    "C++ API is only supported if both 1-body and some (eri, eri3, eri2, rkb_eri) 2-body integrals are enabled"
 #endif
 
 #include <libint2/atom.h>
diff --git a/include/libint2/engine.h b/include/libint2/engine.h
index 8e1a30ed1..0e04b085d 100644
--- a/include/libint2/engine.h
+++ b/include/libint2/engine.h
@@ -192,12 +192,32 @@ enum class Operator {
   /// \sa operator_traits<Operator::op_q_gau_op>
   /// \sa Surjuse, Deng, Asadchev, Valeev, arXiv:2603.16989 (2025)
   op_q_gau_op,
+  /// (1-body) σp . r . σp, the σ·p-on-both-sides analog of the dipole moment.
+  /// Produces 12 components = 3 dipole directions × 4 Pauli quaternion
+  /// components (trace + 3 antisym), indexed as `4*k + q` with `k ∈ {x,y,z}`
+  /// the dipole direction and `q ∈ {0=trace, 1=σ_x, 2=σ_y, 3=σ_z}` the Pauli
+  /// piece. Origin set via `engine.set_params(std::array<double,3>)`.
+  oprop,
   /// \f$ \delta(\vec{r}_1 - \vec{r}_2) \f$
   delta,
   /// (2-body) Coulomb operator = \f$ r_{12}^{-1} \f$
   coulomb,
   /// alias for Operator::coulomb
   r12_m1 = coulomb,
+  /// (2-body) \f$ r_{12}^{-1} (σ.p_{k1})(σ.p_{k2})\f$ where k1  & k2 are
+  /// centers of ket1 and ket2, respectively
+  coulomb_opop,
+  /// (2-body) \f$ (σ.p_{b1})(σ.p_{b2}) r_{12}^{-1} (σ.p_{k1})(σ.p_{k2})\f$
+  /// where b1 & b2 are centers of bra1 and bra2 and k1  & k2 are centers of
+  /// ket1 and ket2, respectively
+  opop_coulomb_opop,
+  /// (2-body) \f$ (σ.p_{b2}) r_{12}^{-1} (σ.p_{k2}) \f$ where b2 is the center
+  /// of bra2 and k2 is the center of ket2; Gaunt LS "bilinear" integral.
+  /// Produces 9 components (outer product of two Cartesian directions),
+  /// indexed as `3*a + b` with `a` = bra-side direction, `b` = ket-side
+  /// direction, and `a,b ∈ {x=0, y=1, z=2}`. Unlike coulomb_opop, the 9
+  /// components are NOT contracted via σ·σ — all are kept independent.
+  op_coulomb_op,
   /// contracted Gaussian geminal
   cgtg,
   /// contracted Gaussian geminal times Coulomb
@@ -230,7 +250,7 @@ enum class Operator {
   // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!keep this
   // updated!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   first_1body_oper = overlap,
-  last_1body_oper = op_q_gau_op,
+  last_1body_oper = oprop,
   first_2body_oper = delta,
   last_2body_oper = stg_x_coulomb,
   first_oper = first_1body_oper,
@@ -291,6 +311,7 @@ struct operator_traits<Operator::nuclear>
   typedef const libint2::FmEval_Reference<scalar_type> core_eval_type;
 #endif
 };
+
 /// opVop: 4 quaternionic components (scalar, x, y, z) of (σ·p)V(σ·p).
 /// Inherits point-charge params from nuclear; uses Boys function core eval.
 template <>
@@ -415,6 +436,13 @@ struct operator_traits<Operator::sphemultipole>
       (LIBINT_MULTIPOLE_MAX_ORDER + 1) * (LIBINT_MULTIPOLE_MAX_ORDER + 1);
 };
 
+template <>
+struct operator_traits<Operator::oprop>
+    : public operator_traits<Operator::emultipole1> {
+  static constexpr auto nopers = 12;
+  static constexpr auto intrinsic_deriv_order = 2;
+};
+
 template <>
 struct operator_traits<Operator::coulomb>
     : public detail::default_operator_traits {
@@ -424,6 +452,31 @@ struct operator_traits<Operator::coulomb>
   typedef const libint2::FmEval_Reference<scalar_type> core_eval_type;
 #endif
 };
+
+template <>
+struct operator_traits<Operator::coulomb_opop>
+    : public operator_traits<Operator::coulomb> {
+  static constexpr auto nopers = 4;
+  static constexpr auto intrinsic_deriv_order = 2;
+};
+template <>
+struct operator_traits<Operator::opop_coulomb_opop>
+    : public operator_traits<Operator::coulomb> {
+  /// 16 components: tensor product of two independent spin-space quaternions
+  /// index = 4 * bra_spin + ket_spin, where spin in {S=0, X=1, Y=2, Z=3}
+  static constexpr auto nopers = 16;
+  static constexpr auto intrinsic_deriv_order = 4;
+};
+template <>
+struct operator_traits<Operator::op_coulomb_op>
+    : public operator_traits<Operator::coulomb> {
+  /// 9 components: Cartesian dyadic of the two (σ·p) directions.
+  /// index = 3 * a + b, with a = bra-side direction, b = ket-side direction,
+  /// a,b ∈ {x=0, y=1, z=2}.
+  static constexpr auto nopers = 9;
+  static constexpr auto intrinsic_deriv_order = 2;
+};
+
 namespace detail {
 template <int K>
 struct cgtg_operator_traits : public detail::default_operator_traits {
@@ -917,16 +970,16 @@ class Engine {
       const Shell& ket2, const ShellPair* spbra, const ShellPair* spket);
 
   // clang-format off
-  /** this specifies target precision for computing the integrals, i.e.
-   *  the target absolute (i.e., not relative) error of the integrals.
-   *  It is used to screen out primitive integrals. For some screening
-   *  methods precision can be almost guaranteed (due to finite precision
-   *  of the precomputed interpolation tables used to evaluate the core integrals
-   *  it is not in general possible to guarantee precision rigorously).
-   *
-   *  @param[in] prec the target precision
-   *  @sa ScreeningMethod
-   */
+ /** this specifies target precision for computing the integrals, i.e.
+  *  the target absolute (i.e., not relative) error of the integrals.
+  *  It is used to screen out primitive integrals. For some screening
+  *  methods precision can be almost guaranteed (due to finite precision
+  *  of the precomputed interpolation tables used to evaluate the core integrals
+  *  it is not in general possible to guarantee precision rigorously).
+  *
+  *  @param[in] prec the target precision
+  *  @sa ScreeningMethod
+  */
   // clang-format on
   Engine& set_precision(scalar_type prec) {
     if (prec <= 0.) {
diff --git a/include/libint2/engine.impl.h b/include/libint2/engine.impl.h
index b200b6945..01269538d 100644
--- a/include/libint2/engine.impl.h
+++ b/include/libint2/engine.impl.h
@@ -70,40 +70,44 @@ typename std::remove_all_extents<T>::type* to_ptr1(T (&a)[N]) {
 /// These MUST appear in the same order as in Operator.
 /// You must also update BOOST_PP_NBODY_OPERATOR_LAST_ONEBODY_INDEX when you add
 /// one-body ints
-#define BOOST_PP_NBODY_OPERATOR_LIST                \
-  (overlap,                    /* overlap */        \
-   (kinetic,                   /* kinetic */        \
-    (elecpot,                  /* nuclear */        \
-     (elecpot,                 /* erf_nuclear */    \
-      (elecpot,                /* erfc_nuclear */   \
-       (elecpot,               /* erfx_nuclear */   \
-        (elecpot,              /* q_gau */          \
-         (1emultipole,         /* emultipole1 */    \
-          (2emultipole,        /* emultipole2 */    \
-           (3emultipole,       /* emultipole3 */    \
-            (sphemultipole,    /* sphemultipole */  \
-             (opVop,           /* opVop */          \
-              (opVop,          /* op_q_gau_op */    \
-               (eri,           /* delta */          \
-                (eri,          /* coulomb */        \
-                 (eri,         /* cgtg */           \
-                  (eri,        /* cgtg_x_coulomb */ \
-                   (eri,       /* delcgtg2 */       \
-                    (eri,      /* r12 */            \
-                     (eri,     /* erf_coulomb */    \
-                      (eri,    /* erfc_coulomb */   \
-                       (eri,   /* erfx_coulomb */   \
-                        (eri,  /* stg */            \
-                         (eri, /* yukawa */         \
-                          BOOST_PP_NIL))))))))))))))))))))))))
+#define BOOST_PP_NBODY_OPERATOR_LIST                         \
+  (overlap,                          /* overlap */           \
+   (kinetic,                         /* kinetic */           \
+    (elecpot,                        /* nuclear */           \
+     (elecpot,                       /* erf_nuclear */       \
+      (elecpot,                      /* erfc_nuclear */      \
+       (elecpot,                     /* erfx_nuclear */      \
+        (elecpot,                    /* q_gau */             \
+         (1emultipole,               /* emultipole1 */       \
+          (2emultipole,              /* emultipole2 */       \
+           (3emultipole,             /* emultipole3 */       \
+            (sphemultipole,          /* sphemultipole */     \
+             (opVop,                 /* opVop */             \
+              (opVop,                /* op_q_gau_op */       \
+               (oprop,                /* oprop */            \
+                (eri,                  /* delta */           \
+                 (eri,                 /* coulomb */         \
+                  (coulomb_opop,       /* coulomb_opop */    \
+                   (opop_coulomb_opop, /* opop_coulomb_opop */ \
+                    (op_coulomb_op,    /* op_coulomb_op */   \
+                     (eri,             /* cgtg */            \
+                      (eri,            /* cgtg_x_coulomb */  \
+                       (eri,           /* delcgtg2 */        \
+                        (eri,          /* r12 */             \
+                         (eri,         /* erf_coulomb */     \
+                          (eri,        /* erfc_coulomb */    \
+                           (eri,       /* erfx_coulomb */    \
+                            (eri,      /* stg */             \
+                             (eri,     /* yukawa */          \
+                              BOOST_PP_NIL))))))))))))))))))))))))))))
 
 #define BOOST_PP_NBODY_OPERATOR_INDEX_TUPLE \
   BOOST_PP_MAKE_TUPLE(BOOST_PP_LIST_SIZE(BOOST_PP_NBODY_OPERATOR_LIST))
 #define BOOST_PP_NBODY_OPERATOR_INDEX_LIST \
   BOOST_PP_TUPLE_TO_LIST(BOOST_PP_NBODY_OPERATOR_INDEX_TUPLE)
 #define BOOST_PP_NBODY_OPERATOR_LAST_ONEBODY_INDEX \
-  12  // op_q_gau_op, the 13th member of BOOST_PP_NBODY_OPERATOR_LIST, is the
-      // last 1-body operator
+  13  // oprop, the 14th member of BOOST_PP_NBODY_OPERATOR_LIST, is the last
+      // 1-body operator
 
 // make list of braket indices for n-body ints
 #define BOOST_PP_NBODY_BRAKET_INDEX_TUPLE \
@@ -668,23 +672,23 @@ __libint2_engine_inline void Engine::initialize(size_t max_nprim) {
   // validate braket
 #ifndef LIBINT_INCLUDE_ONEBODY
   assert(braket_ != BraKet::x_x &&
-         "this braket type not supported by the library; give --enable-1body "
-         "to configure");
+         "this braket type not supported by the library; configure with "
+         "-DLIBINT2_ENABLE_ONEBODY >= 0");
 #endif
 #ifndef LIBINT_INCLUDE_ERI
   assert(braket_ != BraKet::xx_xx &&
-         "this braket type not supported by the library; give --enable-eri to "
-         "configure");
+         "this braket type not supported by the library; configure with "
+         "-DLIBINT2_ENABLE_ERI >= 0");
 #endif
 #ifndef LIBINT_INCLUDE_ERI3
   assert((braket_ != BraKet::xs_xx && braket_ != BraKet::xx_xs) &&
-         "this braket type not supported by the library; give --enable-eri3 to "
-         "configure");
+         "this braket type not supported by the library; configure with "
+         "-DLIBINT2_ENABLE_ERI3 >= 0");
 #endif
 #ifndef LIBINT_INCLUDE_ERI2
   assert(braket_ != BraKet::xs_xs &&
-         "this braket type not supported by the library; give --enable-eri2 to "
-         "configure");
+         "this braket type not supported by the library; configure with "
+         "-DLIBINT2_ENABLE_ERI2 >= 0");
 #endif
 
   // make sure it's no default initialized
@@ -706,6 +710,7 @@ __libint2_engine_inline void Engine::initialize(size_t max_nprim) {
     // target indices.
     const auto permutable_targets =
         deriv_order_ > 0 &&
+
         (braket_ == BraKet::xx_xx || braket_ == BraKet::xs_xx ||
          braket_ == BraKet::xx_xs);
     if (permutable_targets)
@@ -1050,7 +1055,7 @@ __libint2_engine_inline void Engine::compute_primdata(Libint_t& primdata,
   //  }
 
   if (oper_ == Operator::emultipole1 || oper_ == Operator::emultipole2 ||
-      oper_ == Operator::emultipole3) {
+      oper_ == Operator::emultipole3 || oper_ == Operator::oprop) {
     const auto& O = any_cast<
         const operator_traits<Operator::emultipole1>::oper_params_type&>(
         params_);  // same as emultipoleX
@@ -1098,7 +1103,8 @@ __libint2_engine_inline void Engine::compute_primdata(Libint_t& primdata,
   primdata._0_Overlap_0_z[0] = ovlp_ss_z;
 
   if (oper_ == Operator::kinetic || (deriv_order_ > 0) ||
-      oper_ == Operator::opVop || oper_ == Operator::op_q_gau_op) {
+      oper_ == Operator::opVop || oper_ == Operator::op_q_gau_op ||
+      oper_ == Operator::oprop) {
 #if LIBINT2_DEFINED(eri, two_alpha0_bra)
     primdata.two_alpha0_bra[0] = 2.0 * alpha1;
 #endif
@@ -1262,21 +1268,96 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
 
 #if LIBINT2_SHELLQUARTET_SET == \
     LIBINT2_SHELLQUARTET_SET_STANDARD  // standard angular momentum ordering
-  const auto swap_tbra = (tbra1.contr[0].l < tbra2.contr[0].l);
-  const auto swap_tket = (tket1.contr[0].l < tket2.contr[0].l);
-  const auto swap_braket =
-      ((braket_ == BraKet::xx_xx) && (tbra1.contr[0].l + tbra2.contr[0].l >
-                                      tket1.contr[0].l + tket2.contr[0].l)) ||
-      braket_ == BraKet::xx_xs;
+  bool swap_braket;
+  bool swap_tbra, swap_tket;
+  if (oper_ == Operator::opop_coulomb_opop) {
+    // For σpσpCoulombσpσp: (ab|cd) = (cd|ab) = (ba|dc)* = (dc|ba)*
+    // Canonical form: lc >= ld (or la >= lb when lc == ld),
+    // la+lb <= lc+ld (or max(la,lb) <= lc when sums equal)
+    const auto bra_total = tbra1.contr[0].l + tbra2.contr[0].l;
+    const auto ket_total = tket1.contr[0].l + tket2.contr[0].l;
+    const auto bra_max = std::max(tbra1.contr[0].l, tbra2.contr[0].l);
+    const auto ket_max = std::max(tket1.contr[0].l, tket2.contr[0].l);
+    swap_braket = ((braket_ == BraKet::xx_xx) &&
+                   (bra_total > ket_total ||
+                    (bra_total == ket_total && bra_max > ket_max))) ||
+                  braket_ == BraKet::xx_xs;
+    // Coupled swap: after braket swap, sort the pair that ends up in ket
+    // position to ensure lc >= ld; when lc == ld, also sort bra (la >= lb)
+    if (swap_braket) {
+      // After braket swap: new ket = original bra, new bra = original ket.
+      // Coupled swap sorts new ket (ensure lc >= ld).
+      const bool swap_p1p2 = (tbra1.contr[0].l < tbra2.contr[0].l);
+      swap_tbra = swap_tket = swap_p1p2;
+    } else {
+      // No braket swap: ket stays as original ket.
+      // Coupled swap sorts ket (ensure lc >= ld).
+      const bool swap_p1p2 = (tket1.contr[0].l < tket2.contr[0].l);
+      swap_tbra = swap_tket = swap_p1p2;
+    }
+  } else if (oper_ == Operator::op_coulomb_op) {
+    // σpCoulombσp: only bra↔ket (particle 1↔2) swap is a symmetry (with
+    // (a,b)↔(b,a) component remap). Within-side swap is NOT a symmetry
+    // because σ·p attaches to one specific function per side; moving it to
+    // the other function changes the integral in a way IBP cannot recover
+    // across electrons. Canonical form: la+lb <= lc+ld only.
+    const auto bra_total = tbra1.contr[0].l + tbra2.contr[0].l;
+    const auto ket_total = tket1.contr[0].l + tket2.contr[0].l;
+    swap_braket = ((braket_ == BraKet::xx_xx) && (bra_total > ket_total)) ||
+                  braket_ == BraKet::xx_xs;
+    swap_tbra = false;
+    swap_tket = false;
+  } else {
+    swap_braket = ((braket_ == BraKet::xx_xx) &&
+                   (tbra1.contr[0].l + tbra2.contr[0].l >
+                    tket1.contr[0].l + tket2.contr[0].l) &&
+                   (oper_ != Operator::coulomb_opop)) ||
+                  braket_ == BraKet::xx_xs;
+    swap_tbra = (tbra1.contr[0].l < tbra2.contr[0].l);
+    swap_tket = (tket1.contr[0].l < tket2.contr[0].l);
+  }
+
+  // N.B. cannot swap bra and ket for coulomb_opop since the ket is mutated by
+  // this operator
 #else  // orca angular momentum ordering
-  const auto swap_tbra = (tbra1.contr[0].l > tbra2.contr[0].l);
-  const auto swap_tket = (tket1.contr[0].l > tket2.contr[0].l);
-  const auto swap_braket =
-      ((braket_ == BraKet::xx_xx) && (tbra1.contr[0].l + tbra2.contr[0].l <
-                                      tket1.contr[0].l + tket2.contr[0].l)) ||
-      braket_ == BraKet::xx_xs;
-  assert(false && "feature not implemented");
-  abort();
+  bool swap_braket;
+  bool swap_tbra, swap_tket;
+  if (oper_ == Operator::opop_coulomb_opop) {
+    // ORCA canonical for σpσpCoulombσpσp: lc <= ld (or la <= lb when lc == ld),
+    // la+lb >= lc+ld (or min(la,lb) >= lc when sums equal)
+    const auto bra_total = tbra1.contr[0].l + tbra2.contr[0].l;
+    const auto ket_total = tket1.contr[0].l + tket2.contr[0].l;
+    const auto bra_min = std::min(tbra1.contr[0].l, tbra2.contr[0].l);
+    const auto ket_min = std::min(tket1.contr[0].l, tket2.contr[0].l);
+    swap_braket = ((braket_ == BraKet::xx_xx) &&
+                   (bra_total < ket_total ||
+                    (bra_total == ket_total && bra_min < ket_min))) ||
+                  braket_ == BraKet::xx_xs;
+    if (swap_braket) {
+      const bool swap_p1p2 = (tbra1.contr[0].l > tbra2.contr[0].l);
+      swap_tbra = swap_tket = swap_p1p2;
+    } else {
+      const bool swap_p1p2 = (tket1.contr[0].l > tket2.contr[0].l);
+      swap_tbra = swap_tket = swap_p1p2;
+    }
+  } else if (oper_ == Operator::op_coulomb_op) {
+    // σpCoulombσp: only bra↔ket swap is a symmetry (with (a,b)↔(b,a) remap).
+    // ORCA canonical form: la+lb >= lc+ld only.
+    const auto bra_total = tbra1.contr[0].l + tbra2.contr[0].l;
+    const auto ket_total = tket1.contr[0].l + tket2.contr[0].l;
+    swap_braket = ((braket_ == BraKet::xx_xx) && (bra_total < ket_total)) ||
+                  braket_ == BraKet::xx_xs;
+    swap_tbra = false;
+    swap_tket = false;
+  } else {
+    swap_tbra = (tbra1.contr[0].l > tbra2.contr[0].l);
+    swap_tket = (tket1.contr[0].l > tket2.contr[0].l);
+    swap_braket = ((braket_ == BraKet::xx_xx) &&
+                   (tbra1.contr[0].l + tbra2.contr[0].l <
+                    tket1.contr[0].l + tket2.contr[0].l) &&
+                   (oper_ != Operator::coulomb_opop)) ||
+                  braket_ == BraKet::xx_xs;
+  }
 #endif
   const auto& bra1 =
       swap_braket ? (swap_tket ? tket2 : tket1) : (swap_tbra ? tbra2 : tbra1);
@@ -1471,17 +1552,24 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
             const scalar_type rho = gammap * gammaq * oogammapq;
             const scalar_type T = PQ2 * rho;
             auto* gm_ptr = &(primdata.LIBINT_T_SS_EREP_SS(0)[0]);
-            const auto mmax = l + deriv_order_;
+            const auto mmax = l + deriv_order_ + intrinsic_deriv_order();
 
             if (!skip_core_ints) {
+// Simple core-eval dispatch: just `core_eval_ptr->eval(gm_ptr, T, mmax)`.
+// Applies to Coulomb-family operators whose core integral is the bare Fm.
+#define LIBINT2_SIMPLE_CORE_EVAL_CASE(OP)                           \
+  case Operator::OP: {                                              \
+    const auto& core_eval_ptr =                                     \
+        any_cast<const detail::core_eval_pack_type<Operator::OP>&>( \
+            core_eval_pack_)                                        \
+            .first();                                               \
+    core_eval_ptr->eval(gm_ptr, T, mmax);                           \
+  } break
               switch (oper_) {
-                case Operator::coulomb: {
-                  const auto& core_eval_ptr =
-                      any_cast<const detail::core_eval_pack_type<
-                          Operator::coulomb>&>(core_eval_pack_)
-                          .first();
-                  core_eval_ptr->eval(gm_ptr, T, mmax);
-                } break;
+                LIBINT2_SIMPLE_CORE_EVAL_CASE(coulomb);
+                LIBINT2_SIMPLE_CORE_EVAL_CASE(coulomb_opop);
+                LIBINT2_SIMPLE_CORE_EVAL_CASE(opop_coulomb_opop);
+                LIBINT2_SIMPLE_CORE_EVAL_CASE(op_coulomb_op);
                 case Operator::cgtg_x_coulomb: {
                   const auto& core_eval_ptr =
                       any_cast<const detail::core_eval_pack_type<
@@ -1600,6 +1688,7 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
                   assert(false && "missing case in a switch");  // unreachable
                   abort();
               }
+#undef LIBINT2_SIMPLE_CORE_EVAL_CASE
             }
 
             for (auto m = 0; m != mmax + 1; ++m) {
@@ -1701,7 +1790,7 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
               const auto Wz =
                   (gammap_o_gammapgammaq * P[2] + gammaq_o_gammapgammaq * Q[2]);
 
-              if (deriv_order_ > 0 || lmax_bra > 0) {
+              if (deriv_order_ + intrinsic_deriv_order() > 0 || lmax_bra > 0) {
 #if LIBINT2_DEFINED(eri, WP_x)
                 primdata.WP_x[0] = Wx - P[0];
 #endif
@@ -1712,7 +1801,7 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
                 primdata.WP_z[0] = Wz - P[2];
 #endif
               }
-              if (deriv_order_ > 0 || lmax_ket > 0) {
+              if (deriv_order_ + intrinsic_deriv_order() > 0 || lmax_ket > 0) {
 #if LIBINT2_DEFINED(eri, WQ_x)
                 primdata.WQ_x[0] = Wx - Q[0];
 #endif
@@ -1796,7 +1885,7 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
 #endif
 
               // prefactors for derivative ERI relations
-              if (deriv_order_ > 0) {
+              if (deriv_order_ + intrinsic_deriv_order() > 0) {
 #if LIBINT2_DEFINED(eri, alpha1_rho_over_zeta2)
                 primdata.alpha1_rho_over_zeta2[0] =
                     alpha0 * (oogammap * gammaq_o_gammapgammaq);
@@ -1879,7 +1968,8 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
   }
 
   // compute directly (ss|ss)
-  const auto compute_directly = lmax == 0 && deriv_order_ == 0;
+  const auto compute_directly =
+      lmax == 0 && deriv_order_ == 0 & intrinsic_deriv_order() == 0;
 
   if (compute_directly) {
 #ifdef LIBINT2_ENGINE_TIMERS
@@ -1957,8 +2047,10 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
                "the angular momentum limit is exceeded");
         assert(ket2.contr[0].l <= ket_lmax &&
                "the angular momentum limit is exceeded");
+
         buildfnidx = (bra1.contr[0].l * ket_lmax + ket1.contr[0].l) * ket_lmax +
                      ket2.contr[0].l;
+
 #ifdef LIBINT_ERI3_PURE_SH
         if (bra1.contr[0].l > 1)
           assert(bra1.contr[0].pure &&
@@ -2141,19 +2233,52 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
                     tgt_ptr + tgt_col_idx, nr1_tgt, nr2_tgt,
                     Eigen::Stride<Eigen::Dynamic, Eigen::Dynamic>(
                         nr2_tgt * ncol_tgt, ncol_tgt));
+                // Coupled swap sign correction for multi-component operators
+                Shell::real_t oper_cart_component_phase = 1.0;
+                if (swap_tket && oper_ == Operator::opop_coulomb_opop) {
+                  const bool bra_is_spin = (s / 4) > 0;
+                  const bool ket_is_spin = (s % 4) > 0;
+                  if (bra_is_spin != ket_is_spin)
+                    oper_cart_component_phase = -1.0;
+                }
+                if (swap_tket && oper_ == Operator::coulomb_opop && s > 0)
+                  oper_cart_component_phase = -1.0;
+                // op_coulomb_op irrep layout under bra↔ket swap: antisym
+                // components (s ∈ {1,2,3}) flip sign; scalar (0) and sym-TL
+                // (4..8) are invariant. swap_tket is always false for this
+                // operator; the sign correction applies on swap_braket alone.
+                if (oper_ == Operator::op_coulomb_op && s >= 1 && s <= 3)
+                  oper_cart_component_phase = -1.0;
                 if (swap_tbra)
-                  tgt_blk_mat = src_blk_mat.transpose();
+                  tgt_blk_mat =
+                      oper_cart_component_phase * src_blk_mat.transpose();
                 else
-                  tgt_blk_mat = src_blk_mat;
+                  tgt_blk_mat = oper_cart_component_phase * src_blk_mat;
               } else {
                 // source row {r1,r2} is mapped to target row {r1,r2} if
                 // !swap_tbra, else to {r2,r1}
                 const auto tgt_row_idx =
                     !swap_tbra ? r1 * nr2 + r2 : r2 * nr1 + r1;
                 Map tgt_blk_mat(tgt_ptr + tgt_row_idx * ncol, nc1_tgt, nc2_tgt);
-                if (swap_tket)
-                  tgt_blk_mat = src_blk_mat.transpose();
-                else
+                if (swap_tket) {
+                  Shell::real_t oper_cart_component_phase = 1.0;
+                  if (oper_ == Operator::opop_coulomb_opop) {
+                    // Option A ordering: index = 4*bra + ket
+                    // Coupled swap (a<->b AND c<->d) flips sign when exactly
+                    // one of bra/ket is a cross product (spin != S):
+                    //   bra_spin = s/4, ket_spin = s%4 (0=S, 1-3=X/Y/Z)
+                    const bool bra_is_spin = (s / 4) > 0;
+                    const bool ket_is_spin = (s % 4) > 0;
+                    if (bra_is_spin != ket_is_spin)
+                      oper_cart_component_phase = -1.0;
+                  }
+                  if (oper_ == Operator::coulomb_opop && s > 0)
+                    oper_cart_component_phase =
+                        -1.0;  // x,y,z quaternion components flip sign on
+                               // swapping ket for coulomb_opop
+                  tgt_blk_mat =
+                      oper_cart_component_phase * src_blk_mat.transpose();
+                } else
                   tgt_blk_mat = src_blk_mat;
               }
             }  // end of loop
@@ -2178,7 +2303,20 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
                                                     // to primdata_[0].targets
           targets_[s_target] = source;
         }
-      }     // loop over shellsets
+      }  // loop over shellsets
+
+      // For opop_coulomb_opop with swap_braket: swapping particles remaps
+      // component (α,β) → (β,α). With Option A ordering (index=4*bra+ket),
+      // this is a matrix transpose: s_new = 4*(s%4) + (s/4).
+      if (permute && oper_ == Operator::opop_coulomb_opop && swap_braket) {
+        std::array<const value_type*, 16> temp;
+        for (auto s = 0; s != ntargets; ++s) temp[s] = targets_[s];
+        for (auto s = 0; s != ntargets; ++s)
+          targets_[4 * (s % 4) + (s / 4)] = temp[s];
+      }
+      // op_coulomb_op irrep layout: bra↔ket swap is handled in-place by
+      // oper_cart_component_phase above (sign flip on antisym components,
+      // identity on scalar / sym-TL); no pointer remap is needed.
     }       // if need_scratch => needed to transpose and/or tform
     else {  // did not use scratch? may still need to update targets_
       if (set_targets_) {
diff --git a/src/bin/libint/build_libint.cc b/src/bin/libint/build_libint.cc
index 8b5ece9ea..67700a630 100644
--- a/src/bin/libint/build_libint.cc
+++ b/src/bin/libint/build_libint.cc
@@ -31,7 +31,9 @@
   */
 
 #include <boost/preprocessor.hpp>
+#include <chrono>
 #include <fstream>
+#include <iomanip>
 #include <iostream>
 #include <limits>
 #include <vector>
@@ -66,6 +68,8 @@
 using namespace std;
 using namespace libint2;
 
+CodeGenProgress g_progress;
+
 enum ShellSetType {
   ShellSetType_Standard = LIBINT_SHELL_SET_STANDARD,
   ShellSetType_ORCA = LIBINT_SHELL_SET_ORCA
@@ -73,18 +77,39 @@ enum ShellSetType {
 template <ShellSetType ShSet>
 struct ShellQuartetSetPredicate {
   // return true if this set of angular momenta is included
-  static bool value(int la, int lb, int lc, int ld);
+  static bool value(int la, int lb, int lc, int ld, bool p1p2_swappable = true);
 };
+
+/**
+ * standard ordering for angular momenta la, lb, lc, ld
+ * @param p1p2_swappable whether operator allows swaps of particle 1 and 2
+ * functions (e.g., not allowed for Coulombσpσp but allowed
+ * for Coulomb (TwoPRep)).
+ * @param bra_ket_coswappable whether need to swap within both bra and ket.
+ * Not individually swapping of either ket of bra allowed
+ * ( e.g., for σpσpCoulombσpσp)
+ */
 template <>
 struct ShellQuartetSetPredicate<ShellSetType_Standard> {
-  static bool value(int la, int lb, int lc, int ld) {
-    return la >= lb && lc >= ld && la + lb <= lc + ld;
+  static bool value(int la, int lb, int lc, int ld, bool p1p2_swappable = true,
+                    bool bra_ket_coswappable = false) {
+    if (bra_ket_coswappable)
+      return lc >= ld && (la + lb < lc + ld ||
+                          (la + lb == lc + ld && std::max(la, lb) <= lc));
+    else
+      return la >= lb && lc >= ld && (!p1p2_swappable || la + lb <= lc + ld);
   }
 };
 template <>
 struct ShellQuartetSetPredicate<ShellSetType_ORCA> {
-  static bool value(int la, int lb, int lc, int ld) {
-    return la <= lb && lc <= ld && (la < lc || (la == lc && lb <= ld));
+  static bool value(int la, int lb, int lc, int ld, bool p1p2_swappable = true,
+                    bool bra_ket_coswappable = false) {
+    if (bra_ket_coswappable)
+      return lc <= ld && (la + lb > lc + ld ||
+                          (la + lb == lc + ld && std::min(la, lb) >= lc));
+    else
+      return la <= lb && lc <= ld &&
+             (!p1p2_swappable || (la < lc || (la == lc && lb <= ld)));
   }
 };
 template <ShellSetType ShSet>
@@ -192,14 +217,20 @@ static void config_to_api(const std::shared_ptr<CompilationParameters>& cparams,
 
 #ifdef LIBINT_INCLUDE_ERI
 #define USE_GENERIC_ERI_BUILD 1
-#if !USE_GENERIC_ERI_BUILD
+#endif
+#if defined(LIBINT_INCLUDE_ERI) || defined(LIBINT_INCLUDE_RKB_ERI)
+#if defined(USE_GENERIC_ERI_BUILD) && !USE_GENERIC_ERI_BUILD
+template <typename OperType>
 static void build_TwoPRep_2b_2k(
-    std::ostream& os, const std::shared_ptr<CompilationParameters>& cparams,
+    std::ostream& os, std::string label,
+    const std::shared_ptr<CompilationParameters>& cparams,
     std::shared_ptr<Libint2Iface>& iface);
 #else
+template <typename OperType>
 static void build_TwoPRep_2b_2k(
-    std::ostream& os, const std::shared_ptr<CompilationParameters>& cparams,
-    std::shared_ptr<Libint2Iface>& iface, unsigned int deriv_level);
+    std::ostream& os, std::string label,
+    const std::shared_ptr<CompilationParameters>& cparams,
+    std::shared_ptr<Libint2Iface>& iface, unsigned int deriv_level = 0);
 #endif
 #endif
 
@@ -249,6 +280,8 @@ struct AuxQuantaType {
   typedef EmptySet type;
 };
 
+}  // namespace
+
 template <typename OperDescrType>
 OperDescrType make_descr(int, int = 0, int = 0) {
   return OperDescrType();
@@ -274,7 +307,21 @@ template <>
 σpVσp_Descr make_descr<σpVσp_Descr>(int p, int, int) {
   return σpVσp_Descr(p);
 }
-}  // namespace
+
+template <>
+σpRσp_Descr make_descr<σpRσp_Descr>(int p, int, int) {
+  return σpRσp_Descr(p);
+}
+
+template <>
+Coulombσpσp_Descr make_descr<Coulombσpσp_Descr>(int p, int, int) {
+  return Coulombσpσp_Descr(p);
+}
+
+template <>
+σpσpCoulombσpσp_Descr make_descr<σpσpCoulombσpσp_Descr>(int p, int, int) {
+  return σpσpCoulombσpσp_Descr(p);
+}
 
 template <typename _OperType>
 void build_onebody_1b_1k(std::ostream& os, std::string label,
@@ -403,6 +450,14 @@ void build_onebody_1b_1k(std::ostream& os, std::string label,
           descrs.emplace_back(make_descr<OperDescrType>(p));
         }
       }
+      if (std::is_same<_OperType, σpRσpOper>::value) {
+        // reset descriptors array
+        descrs.resize(0);
+        // iterate over 12 = 3 dipole directions × 4 Pauli components
+        for (int p = 0; p != 12; ++p) {
+          descrs.emplace_back(make_descr<OperDescrType>(p));
+        }
+      }
 
       // derivative index is the outermost (slowest running)
       // operator component is second slowest
@@ -480,8 +535,8 @@ void build_onebody_1b_1k(std::ostream& os, std::string label,
         eval_label = oss.str();
       }
 
-      std::cout << "working on " << eval_label << " ... ";
-      std::cout.flush();
+      g_progress.current_task = eval_label;
+      g_progress.print();
 
       std::string prefix(cparams->source_directory());
       std::deque<std::string> decl_filenames;
@@ -521,8 +576,6 @@ void build_onebody_1b_1k(std::ostream& os, std::string label,
       dg->reset();
       memman->reset();
 
-      std::cout << "done" << std::endl;
-
     }  // end of b loop
   }    // end of a loop
 }
@@ -544,11 +597,11 @@ void try_main(int argc, char* argv[]) {
 // overlap, kinetic, elecpot cannot be omitted
 #define BOOST_PP_ONEBODY_TASK_TUPLE                                  \
   (overlap, kinetic, elecpot, 1emultipole, 2emultipole, 3emultipole, \
-   sphemultipole, opVop)
+   sphemultipole, opVop, oprop)
 #define BOOST_PP_ONEBODY_TASK_OPER_TUPLE                              \
   (OverlapOper, KineticOper, ElecPotOper, CartesianMultipoleOper<3u>, \
    CartesianMultipoleOper<3u>, CartesianMultipoleOper<3u>,            \
-   SphericalMultipoleOper, σpVσpOper)
+   SphericalMultipoleOper, σpVσpOper, σpRσpOper)
 #define BOOST_PP_ONEBODY_TASK_LIST \
   BOOST_PP_TUPLE_TO_LIST(BOOST_PP_ONEBODY_TASK_TUPLE)
 #define BOOST_PP_ONEBODY_TASK_OPER_LIST \
@@ -567,6 +620,26 @@ void try_main(int argc, char* argv[]) {
     taskmgr.add(task_label("eri", d));
   }
 #endif
+
+#ifdef LIBINT_INCLUDE_RKB_ERI
+#define BOOST_PP_RKB_ERI_TASK_TUPLE \
+  (coulomb_opop, opop_coulomb_opop, op_coulomb_op)
+#define BOOST_PP_RKB_ERI_TASK_OPER_TUPLE \
+  (CoulombσpσpOper, σpσpCoulombσpσpOper, σpCoulombσpOper)
+#define BOOST_PP_RKB_ERI_TASK_LIST \
+  BOOST_PP_TUPLE_TO_LIST(BOOST_PP_RKB_ERI_TASK_TUPLE)
+#define BOOST_PP_RKB_ERI_TASK_OPER_LIST \
+  BOOST_PP_TUPLE_TO_LIST(BOOST_PP_RKB_ERI_TASK_OPER_TUPLE)
+
+  for (unsigned int d = 0; d <= LIBINT_INCLUDE_RKB_ERI; ++d) {
+#define BOOST_PP_RKB_ERI_MCR1(r, data, elem) \
+  taskmgr.add(task_label(BOOST_PP_STRINGIZE(elem), d));
+
+    BOOST_PP_LIST_FOR_EACH(BOOST_PP_RKB_ERI_MCR1, _, BOOST_PP_RKB_ERI_TASK_LIST)
+#undef BOOST_PP_RKB_ERI_MCR1
+  }
+#endif
+
 #ifdef LIBINT_INCLUDE_ERI3
   for (unsigned int d = 0; d <= LIBINT_INCLUDE_ERI3; ++d) {
     taskmgr.add(task_label("3eri", d));
@@ -668,6 +741,46 @@ void try_main(int argc, char* argv[]) {
     cparams->num_bf(task_label("eri", d), 4);
   }
 #endif
+
+#ifdef LIBINT_INCLUDE_RKB_ERI
+  for (unsigned int d = 0; d <= LIBINT_INCLUDE_RKB_ERI; ++d) {
+#if defined(LIBINT_RKB_ERI_MAX_AM_LIST)
+#define BOOST_PP_RKB_ERI_MCR2(r, data, elem)   \
+  cparams->max_am(                             \
+      task_label(BOOST_PP_STRINGIZE(elem), d), \
+                 token<unsigned int>(LIBINT_RKB_ERI_MAX_AM_LIST, ',', d));
+    BOOST_PP_LIST_FOR_EACH(BOOST_PP_RKB_ERI_MCR2, _, BOOST_PP_RKB_ERI_TASK_LIST)
+#undef BOOST_PP_RKB_ERI_MCR2
+#elif defined(LIBINT_RKB_ERI_MAX_AM)
+#define BOOST_PP_RKB_ERI_MCR3(r, data, elem)               \
+  cparams->max_am(task_label(BOOST_PP_STRINGIZE(elem), d), \
+                             LIBINT_RKB_ERI_MAX_AM);
+    BOOST_PP_LIST_FOR_EACH(BOOST_PP_RKB_ERI_MCR3, _, BOOST_PP_RKB_ERI_TASK_LIST)
+#undef BOOST_PP_RKB_ERI_MCR3
+#endif
+#if defined(LIBINT_RKB_ERI_OPT_AM_LIST)
+#define BOOST_PP_RKB_ERI_MCR4(r, data, elem)   \
+  cparams->max_am_opt(                         \
+      task_label(BOOST_PP_STRINGIZE(elem), d), \
+                 token<unsigned int>(LIBINT_RKB_ERI_OPT_AM_LIST, ',', d));
+    BOOST_PP_LIST_FOR_EACH(BOOST_PP_RKB_ERI_MCR4, _, BOOST_PP_RKB_ERI_TASK_LIST)
+#undef BOOST_PP_RKB_ERI_MCR4
+#elif defined(LIBINT_RKB_ERI_OPT_AM)
+#define BOOST_PP_RKB_ERI_MCR5(r, data, elem)                   \
+  cparams->max_am_opt(task_label(BOOST_PP_STRINGIZE(elem), d), \
+                                 LIBINT_RKB_ERI_OPT_AM);
+    BOOST_PP_LIST_FOR_EACH(BOOST_PP_RKB_ERI_MCR5, _, BOOST_PP_RKB_ERI_TASK_LIST)
+#undef BOOST_PP_RKB_ERI_MCR5
+#endif
+  }
+  for (unsigned int d = 0; d <= LIBINT_INCLUDE_RKB_ERI; ++d) {
+#define BOOST_PP_RKB_ERI_MCR6(r, data, elem) \
+  cparams->num_bf(task_label(BOOST_PP_STRINGIZE(elem), d), 4);
+    BOOST_PP_LIST_FOR_EACH(BOOST_PP_RKB_ERI_MCR6, _, BOOST_PP_RKB_ERI_TASK_LIST)
+#undef BOOST_PP_RKB_ERI_MCR6
+  }
+#endif  // LIBINT_INCLUDE_RKB_ERI
+
 #ifdef LIBINT_INCLUDE_ERI3
   for (unsigned int d = 0; d <= LIBINT_INCLUDE_ERI3; ++d) {
 #if defined(LIBINT_ERI3_MAX_AM_LIST)
@@ -852,6 +965,9 @@ void try_main(int argc, char* argv[]) {
 #ifdef LIBINT_INCLUDE_ERI
   max_deriv = std::max(LIBINT_INCLUDE_ERI, max_deriv);
 #endif
+#ifdef LIBINT_INCLUDE_RKB_ERI
+  max_deriv = std::max(LIBINT_INCLUDE_RKB_ERI, max_deriv);
+#endif
 #ifdef LIBINT_INCLUDE_ERI3
   max_deriv = std::max(LIBINT_INCLUDE_ERI3, max_deriv);
 #endif
@@ -867,6 +983,8 @@ void try_main(int argc, char* argv[]) {
 #endif
   cparams->print(os);
 
+  g_progress.start();
+
 #ifdef LIBINT_INCLUDE_ONEBODY
   for (unsigned int d = 0; d <= LIBINT_INCLUDE_ONEBODY; ++d) {
 #define BOOST_PP_ONEBODY_MCR7(r, data, i, elem)                              \
@@ -879,13 +997,25 @@ void try_main(int argc, char* argv[]) {
 #endif
 #ifdef LIBINT_INCLUDE_ERI
 #if !USE_GENERIC_ERI_BUILD
-  build_TwoPRep_2b_2k(os, cparams, iface);
+  build_TwoPRep_2b_2k<TwoPRep>(os, "eri", cparams, iface);
 #else
   for (unsigned int d = 0; d <= LIBINT_INCLUDE_ERI; ++d) {
-    build_TwoPRep_2b_2k(os, cparams, iface, d);
+    build_TwoPRep_2b_2k<TwoPRep>(os, "eri", cparams, iface, d);
   }
 #endif
 #endif
+
+#ifdef LIBINT_INCLUDE_RKB_ERI
+  for (unsigned int d = 0; d <= LIBINT_INCLUDE_RKB_ERI; ++d) {
+#define BOOST_PP_RKB_ERI_MCR7(r, data, i, elem)                              \
+  build_TwoPRep_2b_2k<BOOST_PP_LIST_AT(BOOST_PP_RKB_ERI_TASK_OPER_LIST, i)>( \
+      os, BOOST_PP_STRINGIZE(elem), cparams, iface, d);
+    BOOST_PP_LIST_FOR_EACH_I(BOOST_PP_RKB_ERI_MCR7, _,
+                             BOOST_PP_RKB_ERI_TASK_LIST)
+#undef BOOST_PP_RKB_ERI_MCR7
+  }
+#endif
+
 #ifdef LIBINT_INCLUDE_ERI3
   for (unsigned int d = 0; d <= LIBINT_INCLUDE_ERI3; ++d) {
     build_TwoPRep_1b_2k(os, cparams, iface, d);
@@ -913,6 +1043,8 @@ void try_main(int argc, char* argv[]) {
   build_G12DKH_2b_2k(os, cparams, iface);
 #endif
 
+  g_progress.finish();
+
   // Generate code for the set-level RRs
   std::deque<std::string> decl_filenames, def_filenames;
   generate_rr_code(os, cparams, decl_filenames, def_filenames);
@@ -985,15 +1117,26 @@ void print_config(std::ostream& os) {
 #ifdef LIBINT_INCLUDE_G12DKH
   os << "Will support G12DKH" << endl;
 #endif
+#ifdef LIBINT_INCLUDE_RKB_ERI
+  os << "Will support restricted kinetically balance (RKB) 4-center ERIs "
+     << std::endl;
+  if (LIBINT_INCLUDE_RKB_ERI > 0)
+    os << "(deriv order = " << LIBINT_INCLUDE_RKB_ERI << ")";
+  os << endl;
+#endif
 }
 
-#ifdef LIBINT_INCLUDE_ERI
-void build_TwoPRep_2b_2k(std::ostream& os,
-                         const std::shared_ptr<CompilationParameters>& cparams,
-                         std::shared_ptr<Libint2Iface>& iface,
-                         unsigned int deriv_level) {
-  const std::string task = task_label("eri", deriv_level);
-  typedef TwoPRep_11_11_sq TwoPRep_sh_11_11;
+#if defined(LIBINT_INCLUDE_ERI) || defined(LIBINT_INCLUDE_RKB_ERI)
+template <typename OperType>
+static void build_TwoPRep_2b_2k(
+    std::ostream& os, std::string label,
+    const std::shared_ptr<CompilationParameters>& cparams,
+    std::shared_ptr<Libint2Iface>& iface, unsigned int deriv_level) {
+  typedef GenIntegralSet_11_11<CGShell, OperType, mType> TwoBody_sh_11_11;
+  typedef typename OperType::Descriptor OperDescrType;
+
+  const std::string task = task_label(label, deriv_level);
+
   vector<CGShell*> shells;
   unsigned int lmax = cparams->max_am(task);
   for (unsigned int l = 0; l <= lmax; l++) {
@@ -1005,6 +1148,7 @@ void build_TwoPRep_2b_2k(std::ostream& os,
   taskmgr.current(task);
   iface->to_params(iface->macro_define(std::string("MAX_AM_") + task, lmax));
 
+  const auto nullaux = typename TwoBody_sh_11_11::AuxIndexType(0u);
   //
   // Construct graphs for each desired target integral and
   // 1) generate source code for the found traversal path
@@ -1019,13 +1163,39 @@ void build_TwoPRep_2b_2k(std::ostream& os,
   std::shared_ptr<CodeContext> context(new CppCodeContext(cparams));
   std::shared_ptr<MemoryManager> memman(new WorstFitMemoryManager());
 
+  // σpCoulombσp has a 2-fold bra↔ket-swap symmetry, with per-component sign
+  // flips under the swap (Antisym* flip sign; Scalar/SymTL* invariant) — this
+  // is captured by p1_p2_swappable=true plus a dedicated predicate that
+  // canonicalizes only la+lb<=lc+ld and emits code for *all* within-side
+  // orderings (within-side swap is NOT a symmetry — σ·p attaches to one
+  // specific function per side, and IBP cannot recover the sign across the
+  // 1/r12 coupling).
+  bool p1_p2_swappable = !std::is_same<OperType, CoulombσpσpOper>::value;
+  bool bra_ket_coswappable = std::is_same<OperType, σpσpCoulombσpσpOper>::value;
+
+  // Note: la, lb, lc, ld generate code for chemist notation (ab|O|cd), where O
+  // is a two-body operator.
   for (unsigned int la = 0; la <= lmax; la++) {
     for (unsigned int lb = 0; lb <= lmax; lb++) {
       for (unsigned int lc = 0; lc <= lmax; lc++) {
         for (unsigned int ld = 0; ld <= lmax; ld++) {
-          if (!ShellQuartetSetPredicate<static_cast<ShellSetType>(
-                  LIBINT_SHELL_SET)>::value(la, lb, lc, ld))
-            continue;
+          // σpCoulombσp: only bra↔ket (particle 1↔2) swap is a symmetry.
+          // Within-side swap is NOT (σ·p would move to a different physical
+          // center; IBP cannot repair the sign across 1/r12). Dedicated
+          // predicate canonicalizes la+lb<=lc+ld only (ORCA: >=) and accepts
+          // all within-side orderings.
+          if constexpr (std::is_same<OperType, σpCoulombσpOper>::value) {
+#if LIBINT_SHELL_SET == LIBINT_SHELL_SET_STANDARD
+            if (!(la + lb <= lc + ld)) continue;
+#else
+            if (!(la + lb >= lc + ld)) continue;
+#endif
+          } else {
+            if (!ShellQuartetSetPredicate<static_cast<ShellSetType>(
+                    LIBINT_SHELL_SET)>::value(la, lb, lc, ld, p1_p2_swappable,
+                                              bra_ket_coswappable))
+              continue;
+          }
 
           // std::shared_ptr<Tactic> tactic(new ParticleDirectionTactic(la+lb >
           // lc+ld ? false : true));
@@ -1036,13 +1206,46 @@ void build_TwoPRep_2b_2k(std::ostream& os,
           const int lim = 1;
           if (!(la == lim && lb == lim && lc == lim && ld == lim)) continue;
 #endif
+          // this will hold all target shell sets
+          std::vector<std::shared_ptr<TwoBody_sh_11_11>> targets;
+
+          /////////////////////////////////
+          // loop over operator components
+          /////////////////////////////////
+          std::vector<OperDescrType> descrs(1);
+          if constexpr (std::is_same<OperType, CoulombσpσpOper>::value) {
+            // reset descriptors array
+            descrs.resize(0);
+            // iterate over 4 quaternion components (single spin space)
+            for (int p = 0; p != 4; ++p) {
+              descrs.emplace_back(OperDescrType(p));
+            }
+          }
+          if constexpr (std::is_same<OperType, σpσpCoulombσpσpOper>::value) {
+            // reset descriptors array
+            descrs.resize(0);
+            // iterate over 16 components (tensor product of two spin spaces)
+            for (int p = 0; p != 16; ++p) {
+              descrs.emplace_back(OperDescrType(p));
+            }
+          }
+          if constexpr (std::is_same<OperType, σpCoulombσpOper>::value) {
+            // reset descriptors array
+            descrs.resize(0);
+            // iterate over 9 SO(3) irrep components: 1 scalar trace + 3
+            // antisym (curl-curl) + 5 sym-traceless
+            for (int p = 0; p != 9; ++p) {
+              descrs.emplace_back(OperDescrType(p));
+            }
+          }
 
           // unroll only if max_am <= cparams->max_am_opt(task)
           using std::max;
           const unsigned int max_am = max(max(la, lb), max(lc, ld));
           const bool need_to_optimize = (max_am <= cparams->max_am_opt(task));
+          const auto nopers = descrs.size();
           const bool need_to_unroll =
-              l_to_cgshellsize(la) * l_to_cgshellsize(lb) *
+              nopers * l_to_cgshellsize(la) * l_to_cgshellsize(lb) *
                   l_to_cgshellsize(lc) * l_to_cgshellsize(ld) <=
               cparams->unroll_threshold();
           const unsigned int unroll_threshold =
@@ -1050,10 +1253,14 @@ void build_TwoPRep_2b_2k(std::ostream& os,
                   ? std::numeric_limits<unsigned int>::max()
                   : 0;
           dg_xxxx->registry()->unroll_threshold(unroll_threshold);
-          dg_xxxx->registry()->do_cse(need_to_optimize);
-          dg_xxxx->registry()->condense_expr(condense_expr(
-              cparams->unroll_threshold(), cparams->max_vector_length() > 1));
-          // dg_xxxx->registry()->condense_expr(true);
+          // For multi-component operators (RKB), components share no
+          // intermediates, so CSE/condense_expr is pure overhead — disable.
+          const bool do_optimize = (nopers > 1) ? false : need_to_optimize;
+          dg_xxxx->registry()->do_cse(do_optimize);
+          dg_xxxx->registry()->condense_expr(
+              do_optimize ? condense_expr(cparams->unroll_threshold(),
+                                          cparams->max_vector_length() > 1)
+                          : false);
           //  Need to accumulate integrals?
           dg_xxxx->registry()->accumulate_targets(
               cparams->accumulate_targets());
@@ -1067,7 +1274,7 @@ void build_TwoPRep_2b_2k(std::ostream& os,
           ////////////
           // NB translational invariance is now handled by CR_DerivGauss
           CartesianDerivIterator<4> diter(deriv_level);
-          std::vector<std::shared_ptr<TwoPRep_sh_11_11>> targets;
+
           bool last_deriv = false;
           do {
             CGShell a(la);
@@ -1084,21 +1291,18 @@ void build_TwoPRep_2b_2k(std::ostream& os,
               }
             }
 
-            std::shared_ptr<TwoPRep_sh_11_11> abcd =
-                TwoPRep_sh_11_11::Instance(a, b, c, d, mType(0u));
-            targets.push_back(abcd);
+            // operator component loop
+            for (unsigned int op = 0; op != descrs.size(); ++op) {
+              OperType oper(descrs[op]);
+
+              std::shared_ptr<TwoBody_sh_11_11> abcd =
+                  TwoBody_sh_11_11::Instance(a, b, c, d, nullaux, oper);
+              targets.push_back(abcd);
+            }
+
             last_deriv = diter.last();
             if (!last_deriv) diter.next();
           } while (!last_deriv);
-          // append all derivatives as targets to the graph
-          for (std::vector<std::shared_ptr<TwoPRep_sh_11_11>>::const_iterator
-                   t = targets.begin();
-               t != targets.end(); ++t) {
-            std::shared_ptr<DGVertex> t_ptr =
-                std::dynamic_pointer_cast<DGVertex, TwoPRep_sh_11_11>(*t);
-            dg_xxxx->append_target(t_ptr);
-          }
-
           // make label that characterizes this set of targets
           // use the label of the nondifferentiated integral as a base
           std::string abcd_label;
@@ -1107,33 +1311,52 @@ void build_TwoPRep_2b_2k(std::ostream& os,
             CGShell b(lb);
             CGShell c(lc);
             CGShell d(ld);
-            std::shared_ptr<TwoPRep_sh_11_11> abcd =
-                TwoPRep_sh_11_11::Instance(a, b, c, d, mType(0u));
-            abcd_label = abcd->label();
+
+            if constexpr (std::is_same<OperType, TwoPRep>::value) {
+              OperType oper;
+              oper = OperType(descrs[0]);
+              std::shared_ptr<TwoBody_sh_11_11> abcd =
+                  TwoBody_sh_11_11::Instance(a, b, c, d, nullaux, oper);
+              abcd_label = abcd->label();
+            } else {
+              std::ostringstream oss;
+              oss << "_" << a.label() << "_" << b.label();
+              oss << "_" << label;
+              oss << "_" << c.label() << "_" << d.label();
+              abcd_label = oss.str();
+            }
           }
           // + derivative level (if deriv_level > 0)
-          std::string label;
+          std::string eval_label;
           {
-            label = "";
+            eval_label = "";
             if (deriv_level != 0) {
               std::ostringstream oss;
               oss << "deriv" << deriv_level;
-              label += oss.str();
+              eval_label += oss.str();
             }
-            label += abcd_label;
+            eval_label += abcd_label;
           }
 
-          std::cout << "working on " << label << " ... ";
-          std::cout.flush();
+          g_progress.current_task = eval_label;
+          g_progress.print();
 
           std::string prefix(cparams->source_directory());
           std::deque<std::string> decl_filenames;
           std::deque<std::string> def_filenames;
 
-          // this will generate code for these targets, and potentially generate
-          // code for its prerequisites
+          // append all targets to the graph
+          for (auto it = targets.begin(); it != targets.end(); ++it) {
+            std::shared_ptr<DGVertex> t_ptr =
+                std::dynamic_pointer_cast<DGVertex, TwoBody_sh_11_11>(*it);
+            dg_xxxx->append_target(t_ptr);
+          }
+
+          // this will generate code for these targets, and potentially
+          // generate code for its prerequisites
           GenerateCode(dg_xxxx, context, cparams, strat, tactic, memman,
-                       decl_filenames, def_filenames, prefix, label, false);
+                       decl_filenames, def_filenames, prefix, eval_label,
+                       false);
 
           // update max stack size and # of targets
           const std::shared_ptr<TaskParameters>& tparams =
@@ -1147,16 +1370,14 @@ void build_TwoPRep_2b_2k(std::ostream& os,
           ostringstream oss;
           oss << context->label_to_function_name("libint2_build_" + task) << "["
               << la << "][" << lb << "][" << lc << "][" << ld
-              << "] = " << context->label_to_function_name(label)
+              << "] = " << context->label_to_function_name(eval_label)
               << context->end_of_stat() << endl;
           iface->to_static_init(oss.str());
 
           // need to declare this function internally
-          for (std::deque<std::string>::const_iterator i =
-                   decl_filenames.begin();
-               i != decl_filenames.end(); ++i) {
+          for (auto& decl_filename : decl_filenames) {
             oss.str("");
-            oss << "#include <" << *i << ">" << endl;
+            oss << "#include <" << decl_filename << ">" << endl;
             iface->to_int_iface(oss.str());
           }
 
@@ -1166,7 +1387,8 @@ void build_TwoPRep_2b_2k(std::ostream& os,
           dg_xxxx->reset();
           memman->reset();
 
-          std::cout << "done" << std::endl;
+          ++g_progress.done;
+          g_progress.print();
 
         }  // end of d loop
       }    // end of c loop
@@ -1174,7 +1396,7 @@ void build_TwoPRep_2b_2k(std::ostream& os,
   }        // end of a loop
 }
 
-#endif  // LIBINT_INCLUDE_ERI
+#endif  // LIBINT_INCLUDE_ERI || LIBINT_INCLUDE_RKB_ERI
 
 #ifdef LIBINT_INCLUDE_ERI3
 
@@ -1334,8 +1556,8 @@ void build_TwoPRep_1b_2k(std::ostream& os,
           label += abcd_label;
         }
 
-        std::cout << "working on " << label << " ... ";
-        std::cout.flush();
+        g_progress.current_task = label;
+        g_progress.print();
 
         std::string prefix(cparams->source_directory());
         std::deque<std::string> decl_filenames;
@@ -1363,10 +1585,9 @@ void build_TwoPRep_1b_2k(std::ostream& os,
         iface->to_static_init(oss.str());
 
         // need to declare this function internally
-        for (std::deque<std::string>::const_iterator i = decl_filenames.begin();
-             i != decl_filenames.end(); ++i) {
+        for (auto& decl_filename : decl_filenames) {
           oss.str("");
-          oss << "#include <" << *i << ">" << endl;
+          oss << "#include <" << decl_filename << ">" << endl;
           iface->to_int_iface(oss.str());
         }
 
@@ -1375,7 +1596,8 @@ void build_TwoPRep_1b_2k(std::ostream& os,
 #endif
         dg_xxx->reset();
         memman->reset();
-
+        ++g_progress.done;
+        g_progress.print();
       }  // end of d loop
     }    // end of c loop
   }      // end of bra loop
@@ -1532,15 +1754,16 @@ void build_TwoPRep_1b_1k(std::ostream& os,
         label += abcd_label;
       }
 
-      std::cout << "working on " << label << " ... ";
+      g_progress.current_task = label;
+      g_progress.print();
       std::cout.flush();
 
       std::string prefix(cparams->source_directory());
       std::deque<std::string> decl_filenames;
       std::deque<std::string> def_filenames;
 
-      // this will generate code for this targets, and potentially generate code
-      // for its prerequisites
+      // this will generate code for this targets, and potentially generate
+      // code for its prerequisites
       GenerateCode(dg_xxx, context, cparams, strat, tactic, memman,
                    decl_filenames, def_filenames, prefix, label, false);
 
@@ -1549,7 +1772,8 @@ void build_TwoPRep_1b_1k(std::ostream& os,
           taskmgr.current().params();
       tparams->max_stack_size(max_am, memman->max_memory_used());
       tparams->max_ntarget(targets.size());
-      // os << " Max memory used = " << memman->max_memory_used() << std::endl;
+      // os << " Max memory used = " << memman->max_memory_used() <<
+      // std::endl;
 
       // set pointer to the top-level evaluator function
       ostringstream oss;
@@ -1560,10 +1784,9 @@ void build_TwoPRep_1b_1k(std::ostream& os,
       iface->to_static_init(oss.str());
 
       // need to declare this function internally
-      for (std::deque<std::string>::const_iterator i = decl_filenames.begin();
-           i != decl_filenames.end(); ++i) {
+      for (auto& decl_filename : decl_filenames) {
         oss.str("");
-        oss << "#include <" << *i << ">" << endl;
+        oss << "#include <" << decl_filename << ">" << endl;
         iface->to_int_iface(oss.str());
       }
 
@@ -1572,7 +1795,6 @@ void build_TwoPRep_1b_1k(std::ostream& os,
 #endif
       dg_xxx->reset();
       memman->reset();
-
     }  // end of ket loop
   }    // end of bra loop
 }
@@ -1741,8 +1963,8 @@ void build_R12kG12_2b_2k(std::ostream& os,
           std::deque<std::string> decl_filenames;
           std::deque<std::string> def_filenames;
 
-          // this will generate code for this targets, and potentially generate
-          // code for its prerequisites
+          // this will generate code for this targets, and potentially
+          // generate code for its prerequisites
           GenerateCode(dg_xxxx, context, cparams, strat, tactic, memman,
                        decl_filenames, def_filenames, prefix, label, false);
 
@@ -2094,11 +2316,11 @@ void build_G12DKH_2b_2k(std::ostream& os,
           oss << "#include <" << decl_filename << ">" << endl;
           iface->to_int_iface(oss.str());
 
-          // For the most expensive (i.e. presumably complete) graph extract all
-          // precomputed quantities -- these will be members of the evaluator
-          // structure also extract all RRs -- need to keep track of these to
-          // figure out which external symbols appearing in RR code belong to
-          // this task also
+          // For the most expensive (i.e. presumably complete) graph extract
+          // all precomputed quantities -- these will be members of the
+          // evaluator structure also extract all RRs -- need to keep track of
+          // these to figure out which external symbols appearing in RR code
+          // belong to this task also
           if (la == lmax && lb == lmax && lc == lmax && ld == lmax)
             extract_symbols(dg_xxxx);
 
@@ -2136,6 +2358,12 @@ void config_to_api(const std::shared_ptr<CompilationParameters>& cparams,
   iface->to_params(iface->macro_define("DERIV_ERI_ORDER", LIBINT_INCLUDE_ERI));
   max_deriv_order = std::max(max_deriv_order, LIBINT_INCLUDE_ERI);
 #endif
+#ifdef LIBINT_INCLUDE_RKB_ERI
+  iface->to_params(iface->macro_define("SUPPORT_RKB_ERI", 1));
+  iface->to_params(
+      iface->macro_define("DERIV_RKB_ERI_ORDER", LIBINT_INCLUDE_RKB_ERI));
+  max_deriv_order = std::max(max_deriv_order, LIBINT_INCLUDE_RKB_ERI);
+#endif
 #ifdef LIBINT_INCLUDE_ERI3
   iface->to_params(iface->macro_define("SUPPORT_ERI3", 1));
   iface->to_params(
@@ -2165,7 +2393,8 @@ void config_to_api(const std::shared_ptr<CompilationParameters>& cparams,
   // generated tasks declare all tasks in a range of valid tasks as defined or
   // not
   LibraryTaskManager& taskmgr = LibraryTaskManager::Instance();
-  // the range is defined by max # of centers, max deriv order, and operator set
+  // the range is defined by max # of centers, max deriv order, and operator
+  // set
   const size_t max_ncenter = 4;
   for (unsigned int ncenter = 0; ncenter <= max_ncenter; ++ncenter) {
     std::stringstream oss;
@@ -2191,8 +2420,9 @@ void config_to_api(const std::shared_ptr<CompilationParameters>& cparams,
 
       {  // 2-body ints
 
-#define BOOST_PP_TWOBODY_TASKOPER_TUPLE \
-  ("eri", "r12kg12", "r12_0_g12", "r12_2_g12", "g12_T1_g12", "g12dkh")
+#define BOOST_PP_TWOBODY_TASKOPER_TUPLE                                    \
+  ("eri", "coulomb_opop", "opop_coulomb_opop", "op_coulomb_op", "r12kg12", \
+   "r12_0_g12", "r12_2_g12", "g12_T1_g12", "g12dkh")
 #define BOOST_PP_TWOBODY_TASKOPER_LIST \
   BOOST_PP_TUPLE_TO_LIST(BOOST_PP_TWOBODY_TASKOPER_TUPLE)
 
diff --git a/src/bin/libint/buildtest.h b/src/bin/libint/buildtest.h
index a4923c022..0ea5b571e 100644
--- a/src/bin/libint/buildtest.h
+++ b/src/bin/libint/buildtest.h
@@ -30,13 +30,57 @@
 #include <libint2/config.h>
 #include <strategy.h>
 
+#include <chrono>
 #include <deque>
 #include <fstream>
+#include <iomanip>
 #include <iostream>
 #include <iterator>
 #include <sstream>
 #include <string>
 
+/// Progress tracker for code generation.
+struct CodeGenProgress {
+  unsigned int done = 0;
+  std::string current_task;
+  std::chrono::steady_clock::time_point start_time;
+  bool started = false;
+
+  void start() {
+    start_time = std::chrono::steady_clock::now();
+    started = true;
+  }
+
+  void print() const {
+    if (!started) return;
+    static const char spinner[] = "|/-\\";
+    const auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(
+                             std::chrono::steady_clock::now() - start_time)
+                             .count();
+    const auto mins = elapsed / 60;
+    const auto secs = elapsed % 60;
+    std::cerr << "\r  " << spinner[done % 4] << "  " << std::setfill('0')
+              << std::setw(2) << mins << ":" << std::setw(2) << secs
+              << std::setfill(' ') << "  [" << done << " functions generated]  "
+              << current_task << "                    " << std::flush;
+  }
+
+  void finish() {
+    if (!started) return;
+    const auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(
+                             std::chrono::steady_clock::now() - start_time)
+                             .count();
+    const auto mins = elapsed / 60;
+    const auto secs = elapsed % 60;
+    std::cerr << "\r  done  " << std::setfill('0') << std::setw(2) << mins
+              << ":" << std::setw(2) << secs << std::setfill(' ') << "  ["
+              << done << " functions generated]"
+              << "                                        " << std::endl;
+    started = false;
+  }
+};
+extern CodeGenProgress g_progress;
+
 namespace libint2 {
 
 // defined in buildtest.cc
@@ -263,7 +307,6 @@ void GenerateCode(const std::shared_ptr<DirectedGraph>& dg,
   // if there are missing prerequisites -- make a list of them
   PrerequisitesExtractor pe;
   if (dg->missing_prerequisites()) {
-    // std::cout << "missing some prerequisites!" << std::endl;
     dg->foreach (pe);
   }
   std::deque<std::shared_ptr<DGVertex> > prereq_list = pe.vertices;
@@ -296,6 +339,11 @@ void GenerateCode(const std::shared_ptr<DirectedGraph>& dg,
   // extract all external symbols
   extract_symbols(dg);
 
+  // Update progress
+  ++g_progress.done;
+  g_progress.current_task = label;
+  g_progress.print();
+
 #if PRINT_DAG_GRAPHVIZ
   {
     std::basic_ofstream<char> dotfile(dg->label() + ".symb.dot");
diff --git "a/src/bin/libint/comp_11_Coulomb\317\203p\317\203p_11.h" "b/src/bin/libint/comp_11_Coulomb\317\203p\317\203p_11.h"
new file mode 100644
index 000000000..db28cb7ea
--- /dev/null
+++ "b/src/bin/libint/comp_11_Coulomb\317\203p\317\203p_11.h"
@@ -0,0 +1,259 @@
+/*
+ *  Copyright (C) 2004-2026 Edward F. Valeev
+ *
+ *  This file is part of Libint compiler.
+ *
+ *  Libint compiler is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Libint compiler is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with Libint compiler.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef LIBINT_COMP_11_COULOMBΣPΣP_11_H
+#define LIBINT_COMP_11_COULOMBΣPΣP_11_H
+
+#include <dims.h>
+#include <entity.h>
+#include <gaussoper.h>
+#include <generic_rr.h>
+#include <task.h>
+#include <twoprep_11_11.h>
+
+namespace libint2 {
+
+/**
+ * this computes integral of
+ * \f$ \frac{1}{r_{ij}} \sigma \cdot \hat{p}_1 \sigma \cdot \hat{p}_2 \f$ over
+ * CGShell/CGF by rewriting it as a linear combination of integrals over
+ * derivatives of \frac{1}{r_{ij}}
+ * @tparam F basis function type. valid choices are CGShell or CGF
+ */
+template <typename F>
+class CR_11_Coulombσpσp_11
+    : public GenericRecurrenceRelation<
+          CR_11_Coulombσpσp_11<F>, F,
+          GenIntegralSet_11_11<F, CoulombσpσpOper, mType>> {
+ public:
+  typedef CR_11_Coulombσpσp_11<F> ThisType;
+  typedef F BasisFunctionType;
+  typedef CoulombσpσpOper OperType;
+  typedef GenIntegralSet_11_11<F, CoulombσpσpOper, mType> TargetType;
+  typedef GenericRecurrenceRelation<ThisType, BasisFunctionType, TargetType>
+      ParentType;
+  friend class GenericRecurrenceRelation<ThisType, BasisFunctionType,
+                                         TargetType>;
+  static const unsigned int max_nchildren = 100;  // TODO figure out
+
+  using ParentType::Instance;
+
+  static bool directional() { return false; }
+
+ private:
+  using ParentType::is_simple;
+  using ParentType::target_;
+  using ParentType::RecurrenceRelation::expr_;
+  using ParentType::RecurrenceRelation::nflops_;
+
+  /// Constructor is private, used by Instance that maintains
+  /// registry of these objects
+  CR_11_Coulombσpσp_11(const std::shared_ptr<TargetType>&, unsigned int = 0);
+
+  static std::string descr() { return "CR"; }
+
+  // --- Code sharing overrides ---
+  // All shell combos with the same quaternion component share one function.
+
+  std::string generate_label() const override {
+    return "CR_Coulombopop_" +
+           std::to_string(target_->oper()->descr().quaternion_index());
+  }
+
+  std::string spfunction_call(
+      const std::shared_ptr<CodeContext>& context,
+      const std::shared_ptr<ImplicitDimensions>& dims) const override {
+    std::ostringstream os;
+    os << context->label_to_function_name(this->label()) << "(inteval, "
+       << context->value_to_pointer(this->rr_target()->symbol());
+    const unsigned int nc = this->num_children();
+    for (unsigned int c = 0; c < nc; c++) {
+      os << ", " << context->value_to_pointer(this->rr_child(c)->symbol());
+    }
+    // total_dim = product of all shell dims (all 4 shells are spectators)
+    unsigned int total_dim = 1;
+    for (unsigned int p = 0; p < 2; p++) {
+      SubIterator* si = target_->bra().member_subiter(p, 0);
+      total_dim *= si->num_iter();
+      delete si;
+      si = target_->ket().member_subiter(p, 0);
+      total_dim *= si->num_iter();
+      delete si;
+    }
+    os << "," << total_dim;
+    LibraryTaskManager& taskmgr = LibraryTaskManager::Instance();
+    taskmgr.current().params()->max_hrr_hsrank(total_dim);
+    os << ")" << context->end_of_stat() << std::endl;
+    return os.str();
+  }
+
+  std::shared_ptr<ImplicitDimensions> adapt_dims_(
+      const std::shared_ptr<ImplicitDimensions>& dims) const override {
+    auto high_dim = std::make_shared<RTimeEntity<EntityTypes::Int>>("highdim");
+    return std::make_shared<ImplicitDimensions>(high_dim, dims->low(),
+                                                dims->vecdim());
+  }
+
+  /// Hand-emit a simple element-wise loop function.
+  /// comp 0: target = src0 + src1 + src2 (dot product)
+  /// comp 1-3: target = src0 - src1 (cross product components)
+  void generate_code(const std::shared_ptr<CodeContext>& context,
+                     const std::shared_ptr<ImplicitDimensions>& dims,
+                     const std::string& funcname, std::ostream& decl,
+                     std::ostream& def) override {
+    // declare_function lives in dg.cc
+    extern std::string declare_function(
+        const std::shared_ptr<CodeContext>& context,
+        const std::shared_ptr<ImplicitDimensions>& dims,
+        const std::shared_ptr<CodeSymbols>& args, const std::string& tlabel,
+        const std::string& function_descr, std::ostream& decl);
+
+    std::shared_ptr<ImplicitDimensions> localdims = adapt_dims_(dims);
+    // inline assign_symbols_: set symbol names on target/children and
+    // populate CodeSymbols
+    std::shared_ptr<CodeSymbols> symbols(new CodeSymbols);
+    this->rr_target()->set_symbol("target");
+    symbols->append_symbol("target");
+    for (unsigned int c = 0; c < this->num_children(); c++) {
+      std::string symb = "src" + std::to_string(c);
+      this->rr_child(c)->set_symbol(symb);
+      symbols->append_symbol(symb);
+    }
+    LibraryTaskManager& taskmgr = LibraryTaskManager::Instance();
+    const std::string tlabel = taskmgr.current().label();
+    const std::string func_decl =
+        declare_function(context, localdims, symbols, tlabel, funcname, decl);
+    def << context->std_header();
+    def << "#include <" << context->label_to_name(funcname) << ".h>\n\n";
+    def << context->code_prefix();
+    def << func_decl << context->open_block() << std::endl;
+    def << context->std_function_header();
+    const unsigned int nc = this->num_children();
+    def << "#ifdef __INTEL_COMPILER\n#pragma ivdep\n#endif\n";
+    def << "for(int hsi = 0; hsi<highdim; hsi++) {\n";
+    def << "target[hsi] = ";
+    if (nc == 3) {
+      def << "src0[hsi] + src1[hsi] + src2[hsi]";
+    } else if (nc == 2) {
+      def << "src0[hsi] - src1[hsi]";
+    }
+    def << ";\n}\n";
+    unsigned int nflops = (nc > 1) ? nc - 1 : 0;
+    def << "/** Number of flops = " << nflops << " */\n";
+    def << context->close_block() << std::endl;
+    def << context->code_postfix();
+  }
+};
+
+template <typename F>
+CR_11_Coulombσpσp_11<F>::CR_11_Coulombσpσp_11(
+    const std::shared_ptr<TargetType>& Tint, unsigned int)
+    : ParentType(Tint, 0) {
+  assert(Tint->num_func_bra(/* particle */ 0) == 1);
+  assert(Tint->num_func_bra(/* particle */ 1) == 1);
+  assert(Tint->num_func_ket(/* particle */ 0) == 1);
+  assert(Tint->num_func_ket(/* particle */ 1) == 1);
+
+  F a(Tint->bra(0, 0));
+  F b(Tint->ket(0, 0));
+  F c(Tint->bra(1, 0));
+  F d(Tint->ket(1, 0));
+
+  const auto& oper = Tint->oper();
+
+  if (a.contracted() || b.contracted() || c.contracted() || d.contracted())
+    return;
+
+  using namespace libint2::algebra;
+  using namespace libint2::prefactor;
+  using libint2::algebra::operator*;
+
+  const mType zero_m(0u);
+
+  ChildFactory<ThisType,
+               GenIntegralSet_11_11<BasisFunctionType, TwoPRep, mType>>
+      factory(this);
+
+  constexpr auto x = 0;
+  constexpr auto y = 1;
+  constexpr auto z = 2;
+
+  F c_x{c};
+  c_x.deriv().inc(x);  // d(c)/dx = c_x
+  F c_y{c};
+  c_y.deriv().inc(y);  // d(c)/dy = c_y
+  F c_z{c};
+  c_z.deriv().inc(z);  // d(c)/dz = c_z
+
+  F d_x{d};
+  d_x.deriv().inc(x);  // d(d)/dx = d_x
+  F d_y{d};
+  d_y.deriv().inc(y);  // d(d)/dy = d_y
+  F d_z{d};
+  d_z.deriv().inc(z);  // d(d)/dz = d_z
+
+  // Component wise generation for quaternion ( a b | 1/r12 | (σ.p) c (σ.p) d )
+  switch (oper->descr().quaternion_index()) {
+    case 0: {
+      // zeroth component = (a b | c_x d_x) + (a b | c_y d_y) + (a b | c_z d_z)
+      auto a_b_cx_dx = factory.make_child(a, b, c_x, d_x, zero_m);
+      auto a_b_cy_dy = factory.make_child(a, b, c_y, d_y, zero_m);
+      auto a_b_cz_dz = factory.make_child(a, b, c_z, d_z, zero_m);
+      if (is_simple()) {
+        expr_ = a_b_cx_dx + a_b_cy_dy + a_b_cz_dz;
+        nflops_ += 2;
+      }
+    } break;
+    case 1: {
+      // x component = (a b | c_y d_z) - (a b | c_z d_y)
+      auto a_b_cy_dz = factory.make_child(a, b, c_y, d_z, zero_m);
+      auto a_b_cz_dy = factory.make_child(a, b, c_z, d_y, zero_m);
+      if (is_simple()) {
+        expr_ = a_b_cy_dz - a_b_cz_dy;
+        nflops_ += 1;
+      }
+    } break;
+    case 2: {
+      // y component = (a b | c_z d_x) - (a b | c_x d_z)
+      auto a_b_cz_dx = factory.make_child(a, b, c_z, d_x, zero_m);
+      auto a_b_cx_dz = factory.make_child(a, b, c_x, d_z, zero_m);
+      if (is_simple()) {
+        expr_ = a_b_cz_dx - a_b_cx_dz;
+        nflops_ += 1;
+      }
+    } break;
+    case 3: {
+      // z component = (a b | c_x d_y) - (a b | c_y d_x)
+      auto a_b_cx_dy = factory.make_child(a, b, c_x, d_y, zero_m);
+      auto a_b_cy_dx = factory.make_child(a, b, c_y, d_x, zero_m);
+      if (is_simple()) {
+        expr_ = a_b_cx_dy - a_b_cy_dx;
+        nflops_ += 1;
+      }
+    } break;
+    default:
+      throw std::runtime_error(
+          "CR_11_Coulombσpσp_11: invalid quaternionic index");
+  }
+
+}  // CR_11_Coulombσpσp_11<F>::CR_11_Coulombσpσp_11
+};  // namespace libint2
+
+#endif  // LIBINT_COMP_11_COULOMBΣPΣP_11_H
diff --git "a/src/bin/libint/comp_11_\317\203pCoulomb\317\203p_11.h" "b/src/bin/libint/comp_11_\317\203pCoulomb\317\203p_11.h"
new file mode 100644
index 000000000..7391d3d31
--- /dev/null
+++ "b/src/bin/libint/comp_11_\317\203pCoulomb\317\203p_11.h"
@@ -0,0 +1,317 @@
+/*
+ *  Copyright (C) 2004-2026 Edward F. Valeev
+ *
+ *  This file is part of Libint compiler.
+ *
+ *  Libint compiler is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Libint compiler is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with Libint compiler.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef LIBINT_COMP_11_ΣPCOULOMBΣP_11_H
+#define LIBINT_COMP_11_ΣPCOULOMBΣP_11_H
+
+#include <dims.h>
+#include <entity.h>
+#include <gaussoper.h>
+#include <generic_rr.h>
+#include <task.h>
+#include <twoprep_11_11.h>
+
+namespace libint2 {
+
+/**
+ * Computes the "Gaunt LS bilinear" integral
+ *   \f$ (\mu\, \sigma\cdot\hat{p}\,\nu | 1/r_{12} | \kappa\,
+ * \sigma\cdot\hat{p}\,\lambda ) \f$ in the SO(3) irreducible decomposition of
+ * the rank-2 tensor \f$ T_{ab} = ( \mu \cdot \partial_a \nu | 1/r_{12} | \kappa
+ * \cdot \partial_b \lambda ) \f$: 1 scalar trace + 3 antisymmetric (curl-curl)
+ * + 5 symmetric-traceless = 9 components total. Each output is a small linear
+ * combination of raw deriv-TwoPRep children, mirroring
+ * comp_11_Coulombσpσp_11.h's pattern of trace/antisym emission.
+ *
+ * @tparam F basis function type. valid choices are CGShell or CGF
+ */
+template <typename F>
+class CR_11_σpCoulombσp_11
+    : public GenericRecurrenceRelation<
+          CR_11_σpCoulombσp_11<F>, F,
+          GenIntegralSet_11_11<F, σpCoulombσpOper, mType>> {
+ public:
+  typedef CR_11_σpCoulombσp_11<F> ThisType;
+  typedef F BasisFunctionType;
+  typedef σpCoulombσpOper OperType;
+  typedef GenIntegralSet_11_11<F, σpCoulombσpOper, mType> TargetType;
+  typedef GenericRecurrenceRelation<ThisType, BasisFunctionType, TargetType>
+      ParentType;
+  friend class GenericRecurrenceRelation<ThisType, BasisFunctionType,
+                                         TargetType>;
+  static const unsigned int max_nchildren = 3;
+
+  using ParentType::Instance;
+
+  static bool directional() { return false; }
+
+ private:
+  using ParentType::is_simple;
+  using ParentType::target_;
+  using ParentType::RecurrenceRelation::expr_;
+  using ParentType::RecurrenceRelation::nflops_;
+
+  /// Constructor is private, used by Instance that maintains
+  /// registry of these objects
+  CR_11_σpCoulombσp_11(const std::shared_ptr<TargetType>&, unsigned int = 0);
+
+  static std::string descr() { return "CR"; }
+
+  // --- Code sharing overrides (mirror Coulombσpσp pattern) ---
+  // All shell quartets with the same quaternion component share one function.
+
+  std::string generate_label() const override {
+    return "CR_σpCoulombσp_" +
+           std::to_string(target_->oper()->descr().component_index());
+  }
+
+  std::string spfunction_call(
+      const std::shared_ptr<CodeContext>& context,
+      const std::shared_ptr<ImplicitDimensions>& dims) const override {
+    std::ostringstream os;
+    os << context->label_to_function_name(this->label()) << "(inteval, "
+       << context->value_to_pointer(this->rr_target()->symbol());
+    const unsigned int nc = this->num_children();
+    for (unsigned int c = 0; c < nc; c++) {
+      os << ", " << context->value_to_pointer(this->rr_child(c)->symbol());
+    }
+    // total_dim = product of all shell dims (all 4 shells are spectators)
+    unsigned int total_dim = 1;
+    for (unsigned int p = 0; p < 2; p++) {
+      SubIterator* si = target_->bra().member_subiter(p, 0);
+      total_dim *= si->num_iter();
+      delete si;
+      si = target_->ket().member_subiter(p, 0);
+      total_dim *= si->num_iter();
+      delete si;
+    }
+    os << "," << total_dim;
+    LibraryTaskManager& taskmgr = LibraryTaskManager::Instance();
+    taskmgr.current().params()->max_hrr_hsrank(total_dim);
+    os << ")" << context->end_of_stat() << std::endl;
+    return os.str();
+  }
+
+  std::shared_ptr<ImplicitDimensions> adapt_dims_(
+      const std::shared_ptr<ImplicitDimensions>& dims) const override {
+    auto high_dim = std::make_shared<RTimeEntity<EntityTypes::Int>>("highdim");
+    return std::make_shared<ImplicitDimensions>(high_dim, dims->low(),
+                                                dims->vecdim());
+  }
+
+  /// Hand-emit the per-component irrep linear combination over deriv-ERI
+  /// children. The combination depends on the target's component index.
+  void generate_code(const std::shared_ptr<CodeContext>& context,
+                     const std::shared_ptr<ImplicitDimensions>& dims,
+                     const std::string& funcname, std::ostream& decl,
+                     std::ostream& def) override {
+    extern std::string declare_function(
+        const std::shared_ptr<CodeContext>& context,
+        const std::shared_ptr<ImplicitDimensions>& dims,
+        const std::shared_ptr<CodeSymbols>& args, const std::string& tlabel,
+        const std::string& function_descr, std::ostream& decl);
+
+    std::shared_ptr<ImplicitDimensions> localdims = adapt_dims_(dims);
+    std::shared_ptr<CodeSymbols> symbols(new CodeSymbols);
+    this->rr_target()->set_symbol("target");
+    symbols->append_symbol("target");
+    for (unsigned int c = 0; c < this->num_children(); c++) {
+      std::string symb = "src" + std::to_string(c);
+      this->rr_child(c)->set_symbol(symb);
+      symbols->append_symbol(symb);
+    }
+    LibraryTaskManager& taskmgr = LibraryTaskManager::Instance();
+    const std::string tlabel = taskmgr.current().label();
+    const std::string func_decl =
+        declare_function(context, localdims, symbols, tlabel, funcname, decl);
+    def << context->std_header();
+    def << "#include <" << context->label_to_name(funcname) << ".h>\n\n";
+    def << context->code_prefix();
+    def << func_decl << context->open_block() << std::endl;
+    def << context->std_function_header();
+    def << "#ifdef __INTEL_COMPILER\n#pragma ivdep\n#endif\n";
+    def << "for(int hsi = 0; hsi<highdim; hsi++) {\n";
+
+    const int comp = this->target_->oper()->descr().component_index();
+    std::string rhs;
+    unsigned int nflops = 0;
+    switch (comp) {
+      case σpCoulombσp_Descr::Scalar:
+        rhs = "src0[hsi] + src1[hsi] + src2[hsi]";
+        nflops = 2;
+        break;
+      case σpCoulombσp_Descr::AntisymX:
+      case σpCoulombσp_Descr::AntisymY:
+      case σpCoulombσp_Descr::AntisymZ:
+      case σpCoulombσp_Descr::SymTLDiagA:
+        rhs = "src0[hsi] - src1[hsi]";
+        nflops = 1;
+        break;
+      case σpCoulombσp_Descr::SymTLDiagB:
+        rhs = "2.0*src0[hsi] - src1[hsi] - src2[hsi]";
+        nflops = 3;
+        break;
+      case σpCoulombσp_Descr::SymTLOffXY:
+      case σpCoulombσp_Descr::SymTLOffXZ:
+      case σpCoulombσp_Descr::SymTLOffYZ:
+        rhs = "src0[hsi] + src1[hsi]";
+        nflops = 1;
+        break;
+      default:
+        throw std::runtime_error(
+            "CR_11_σpCoulombσp_11::generate_code: invalid component index");
+    }
+    def << "target[hsi] = " << rhs << ";\n}\n";
+    def << "/** Number of flops = " << nflops << " */\n";
+    def << context->close_block() << std::endl;
+    def << context->code_postfix();
+  }
+};
+
+template <typename F>
+CR_11_σpCoulombσp_11<F>::CR_11_σpCoulombσp_11(
+    const std::shared_ptr<TargetType>& Tint, unsigned int)
+    : ParentType(Tint, 0) {
+  assert(Tint->num_func_bra(/* particle */ 0) == 1);
+  assert(Tint->num_func_bra(/* particle */ 1) == 1);
+  assert(Tint->num_func_ket(/* particle */ 0) == 1);
+  assert(Tint->num_func_ket(/* particle */ 1) == 1);
+
+  F a(Tint->bra(0, 0));
+  F b(Tint->ket(0, 0));
+  F c(Tint->bra(1, 0));
+  F d(Tint->ket(1, 0));
+
+  const auto& oper = Tint->oper();
+
+  if (a.contracted() || b.contracted() || c.contracted() || d.contracted())
+    return;
+
+  using namespace libint2::algebra;
+  using namespace libint2::prefactor;
+  using libint2::algebra::operator*;
+
+  const mType zero_m(0u);
+
+  ChildFactory<ThisType,
+               GenIntegralSet_11_11<BasisFunctionType, TwoPRep, mType>>
+      factory(this);
+
+  // Chemist notation: (a b | op c op d). σ·p acts on one function per electron
+  // — direction `i` on ket(0,0) = b (electron 1), direction `j` on ket(1,0) = d
+  // (electron 2). Each output is an SO(3) irrep combination of the raw 3×3
+  // T_{ij} dyadic; case bodies build only the children each combination needs.
+  constexpr auto x = 0;
+  constexpr auto y = 1;
+  constexpr auto z = 2;
+
+  auto T = [&](int i, int j) {
+    F b_d{b};
+    b_d.deriv().inc(i);
+    F d_d{d};
+    d_d.deriv().inc(j);
+    return factory.make_child(a, b_d, c, d_d, zero_m);
+  };
+
+  switch (oper->descr().component_index()) {
+    case σpCoulombσp_Descr::Scalar: {
+      auto Txx = T(x, x);
+      auto Tyy = T(y, y);
+      auto Tzz = T(z, z);
+      if (is_simple()) {
+        expr_ = Txx + Tyy + Tzz;
+        nflops_ += 2;
+      }
+    } break;
+    case σpCoulombσp_Descr::AntisymX: {
+      auto Tyz = T(y, z);
+      auto Tzy = T(z, y);
+      if (is_simple()) {
+        expr_ = Tyz - Tzy;
+        nflops_ += 1;
+      }
+    } break;
+    case σpCoulombσp_Descr::AntisymY: {
+      auto Tzx = T(z, x);
+      auto Txz = T(x, z);
+      if (is_simple()) {
+        expr_ = Tzx - Txz;
+        nflops_ += 1;
+      }
+    } break;
+    case σpCoulombσp_Descr::AntisymZ: {
+      auto Txy = T(x, y);
+      auto Tyx = T(y, x);
+      if (is_simple()) {
+        expr_ = Txy - Tyx;
+        nflops_ += 1;
+      }
+    } break;
+    case σpCoulombσp_Descr::SymTLDiagA: {
+      auto Txx = T(x, x);
+      auto Tyy = T(y, y);
+      if (is_simple()) {
+        expr_ = Txx - Tyy;
+        nflops_ += 1;
+      }
+    } break;
+    case σpCoulombσp_Descr::SymTLDiagB: {
+      // 2·T_zz − T_xx − T_yy: child order (Tzz, Txx, Tyy) matches generate_code
+      auto Tzz = T(z, z);
+      auto Txx = T(x, x);
+      auto Tyy = T(y, y);
+      if (is_simple()) {
+        expr_ = Scalar(2) * Tzz - Txx - Tyy;
+        nflops_ += 3;
+      }
+    } break;
+    case σpCoulombσp_Descr::SymTLOffXY: {
+      auto Txy = T(x, y);
+      auto Tyx = T(y, x);
+      if (is_simple()) {
+        expr_ = Txy + Tyx;
+        nflops_ += 1;
+      }
+    } break;
+    case σpCoulombσp_Descr::SymTLOffXZ: {
+      auto Txz = T(x, z);
+      auto Tzx = T(z, x);
+      if (is_simple()) {
+        expr_ = Txz + Tzx;
+        nflops_ += 1;
+      }
+    } break;
+    case σpCoulombσp_Descr::SymTLOffYZ: {
+      auto Tyz = T(y, z);
+      auto Tzy = T(z, y);
+      if (is_simple()) {
+        expr_ = Tyz + Tzy;
+        nflops_ += 1;
+      }
+    } break;
+    default:
+      throw std::runtime_error("CR_11_σpCoulombσp_11: invalid component index");
+  }
+
+}  // CR_11_σpCoulombσp_11<F>::CR_11_σpCoulombσp_11
+
+}  // namespace libint2
+
+#endif  // LIBINT_COMP_11_ΣPCOULOMBΣP_11_H
diff --git "a/src/bin/libint/comp_11_\317\203p\317\203pCoulomb\317\203p\317\203p_11.h" "b/src/bin/libint/comp_11_\317\203p\317\203pCoulomb\317\203p\317\203p_11.h"
new file mode 100644
index 000000000..64d1fdee6
--- /dev/null
+++ "b/src/bin/libint/comp_11_\317\203p\317\203pCoulomb\317\203p\317\203p_11.h"
@@ -0,0 +1,471 @@
+/*
+ *  Copyright (C) 2004-2026 Edward F. Valeev
+ *
+ *  This file is part of Libint compiler.
+ *
+ *  Libint compiler is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Libint compiler is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with Libint compiler.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef LIBINT_COMP_11_ΣPΣPCOULOMBΣPΣP_11_H
+#define LIBINT_COMP_11_ΣPΣPCOULOMBΣPΣP_11_H
+
+#include <dims.h>
+#include <entity.h>
+#include <gaussoper.h>
+#include <generic_rr.h>
+#include <task.h>
+#include <twoprep_11_11.h>
+
+namespace libint2 {
+
+/**
+ * Computes integral of
+ * \f$ (\sigma_1 \cdot \hat{p}_a)(\sigma_1 \cdot \hat{p}_b)
+ *     \frac{1}{r_{12}}
+ *     (\sigma_2 \cdot \hat{p}_c)(\sigma_2 \cdot \hat{p}_d) \f$
+ * over CGShell/CGF by rewriting it as a linear combination of integrals
+ * over derivatives of \f$ \frac{1}{r_{12}} \f$.
+ *
+ * The two sigma operators act on independent spin spaces (electron 1 and
+ * electron 2). Using the Dirac identity (see e.g. Eq. 1.27 of I. P. Grant,
+ * "Relativistic Quantum Theory of Atoms and Molecules", Springer, 2007):
+ *   \f$ (\sigma \cdot a)(\sigma \cdot b) = (a \cdot b)I
+ *       + i\sigma \cdot (a \times b) \f$
+ * applied independently to each particle's spin space gives a tensor product
+ * of two quaternions with \f$ 4 \times 4 = 16 \f$ components:
+ *
+ *   index = 4 * bra_spin_index + ket_spin_index
+ *
+ * where spin indices are: 0=S (scalar/dot product), 1=X, 2=Y, 3=Z
+ * (cross product components).
+ *
+ * The 16 components map to:
+ *   T1 (index 0):      SS = (a.b)(c.d)                  [scalar x scalar]
+ *   T2 (indices 1-3):  SX,SY,SZ = (a.b)(cxd)_{x,y,z}   [scalar x spin]
+ *   T3 (indices 4-6):  XS,YS,ZS = (axb)_{x,y,z}(c.d)   [spin x scalar]
+ *   T4 (indices 7-15): XX..ZZ = -(axb)_i(cxd)_j         [spin x spin]
+ *
+ * Sign convention: T4 components include the minus sign from \f$ i^2 = -1 \f$,
+ * arising from the product of two \f$ i \f$ factors in the Dirac identity:
+ *   \f$ [i\sigma_1 \cdot (a \times b)] \otimes [i\sigma_2 \cdot (c \times d)]
+ *     = -\sigma_{1,i} \otimes \sigma_{2,j}\; (a \times b)_i\, (c \times d)_j
+ *   \f$
+ *
+ * @tparam F basis function type. valid choices are CGShell or CGF
+ */
+template <typename F>
+class CR_11_σpσpCoulombσpσp_11
+    : public GenericRecurrenceRelation<
+          CR_11_σpσpCoulombσpσp_11<F>, F,
+          GenIntegralSet_11_11<F, σpσpCoulombσpσpOper, mType>> {
+ public:
+  typedef CR_11_σpσpCoulombσpσp_11<F> ThisType;
+  typedef F BasisFunctionType;
+  typedef σpσpCoulombσpσpOper OperType;
+  typedef GenIntegralSet_11_11<F, σpσpCoulombσpσpOper, mType> TargetType;
+  typedef GenericRecurrenceRelation<ThisType, BasisFunctionType, TargetType>
+      ParentType;
+  friend class GenericRecurrenceRelation<ThisType, BasisFunctionType,
+                                         TargetType>;
+  static const unsigned int max_nchildren = 100;  // TODO figure out
+
+  using ParentType::Instance;
+
+  static bool directional() { return false; }
+
+ private:
+  using ParentType::is_simple;
+  using ParentType::target_;
+  using ParentType::RecurrenceRelation::expr_;
+  using ParentType::RecurrenceRelation::nflops_;
+
+  /// Constructor is private, used by Instance that maintains
+  /// registry of these objects
+  CR_11_σpσpCoulombσpσp_11(const std::shared_ptr<TargetType>&,
+                           unsigned int = 0);
+
+  static std::string descr() { return "CR"; }
+
+  // --- Code sharing overrides ---
+  // All shell combos with the same quaternion component share one function.
+
+  std::string generate_label() const override {
+    return "CR_opopCoulombopop_" +
+           std::to_string(target_->oper()->descr().quaternion_index());
+  }
+
+  std::string spfunction_call(
+      const std::shared_ptr<CodeContext>& context,
+      const std::shared_ptr<ImplicitDimensions>& dims) const override {
+    std::ostringstream os;
+    os << context->label_to_function_name(this->label()) << "(inteval, "
+       << context->value_to_pointer(this->rr_target()->symbol());
+    const unsigned int nc = this->num_children();
+    for (unsigned int c = 0; c < nc; c++) {
+      os << ", " << context->value_to_pointer(this->rr_child(c)->symbol());
+    }
+    // total_dim = product of all shell dims (all 4 shells are spectators)
+    unsigned int total_dim = 1;
+    for (unsigned int p = 0; p < 2; p++) {
+      SubIterator* si = target_->bra().member_subiter(p, 0);
+      total_dim *= si->num_iter();
+      delete si;
+      si = target_->ket().member_subiter(p, 0);
+      total_dim *= si->num_iter();
+      delete si;
+    }
+    os << "," << total_dim;
+    LibraryTaskManager& taskmgr = LibraryTaskManager::Instance();
+    taskmgr.current().params()->max_hrr_hsrank(total_dim);
+    os << ")" << context->end_of_stat() << std::endl;
+    return os.str();
+  }
+
+  std::shared_ptr<ImplicitDimensions> adapt_dims_(
+      const std::shared_ptr<ImplicitDimensions>& dims) const override {
+    auto high_dim = std::make_shared<RTimeEntity<EntityTypes::Int>>("highdim");
+    return std::make_shared<ImplicitDimensions>(high_dim, dims->low(),
+                                                dims->vecdim());
+  }
+
+  /// Hand-emit a simple element-wise loop function.
+  /// Cannot use S-shell dummy because TwoPRep particle-swap canonicalization
+  /// deduplicates children (e.g., (S_x S_x|S_y S_y) = (S_y S_y|S_x S_x)),
+  /// giving fewer children than the real instance.
+  void generate_code(const std::shared_ptr<CodeContext>& context,
+                     const std::shared_ptr<ImplicitDimensions>& dims,
+                     const std::string& funcname, std::ostream& decl,
+                     std::ostream& def) override {
+    // declare_function lives in dg.cc
+    extern std::string declare_function(
+        const std::shared_ptr<CodeContext>& context,
+        const std::shared_ptr<ImplicitDimensions>& dims,
+        const std::shared_ptr<CodeSymbols>& args, const std::string& tlabel,
+        const std::string& function_descr, std::ostream& decl);
+
+    std::shared_ptr<ImplicitDimensions> localdims = adapt_dims_(dims);
+    // inline assign_symbols_: set symbol names on target/children and
+    // populate CodeSymbols
+    std::shared_ptr<CodeSymbols> symbols(new CodeSymbols);
+    this->rr_target()->set_symbol("target");
+    symbols->append_symbol("target");
+    for (unsigned int c = 0; c < this->num_children(); c++) {
+      std::string symb = "src" + std::to_string(c);
+      this->rr_child(c)->set_symbol(symb);
+      symbols->append_symbol(symb);
+    }
+    LibraryTaskManager& taskmgr = LibraryTaskManager::Instance();
+    const std::string tlabel = taskmgr.current().label();
+    const std::string func_decl =
+        declare_function(context, localdims, symbols, tlabel, funcname, decl);
+    def << context->std_header();
+    def << "#include <" << context->label_to_name(funcname) << ".h>\n\n";
+    def << context->code_prefix();
+    def << func_decl << context->open_block() << std::endl;
+    def << context->std_function_header();
+    // Sign patterns for each component, indexed by child order in constructor.
+    // comp 0 (SS): 9 children, all +1
+    // comp 1-3 (SX,SY,SZ): 6 children, alternating +1,-1
+    // comp 4,8,12 (XS,YS,ZS): 6 children, alternating +1,-1
+    // comp 5-7,9-11,13-15 (XX..ZZ): 4 children, pattern -1,+1,+1,-1
+    const unsigned int nc = this->num_children();
+    def << "#ifdef __INTEL_COMPILER\n#pragma ivdep\n#endif\n";
+    def << "for(int hsi = 0; hsi<highdim; hsi++) {\n";
+    def << "target[hsi] = ";
+    if (nc == 9) {
+      // SS component: all positive
+      for (unsigned int c = 0; c < 9; c++) {
+        if (c > 0) def << " + ";
+        def << "src" << c << "[hsi]";
+      }
+    } else if (nc == 6) {
+      // SX,SY,SZ,XS,YS,ZS: alternating +/-
+      for (unsigned int c = 0; c < 6; c++) {
+        def << ((c % 2 == 0) ? " + " : " - ");
+        def << "src" << c << "[hsi]";
+      }
+    } else if (nc == 4) {
+      // XX,XY,...,ZZ: -src0 + src1 + src2 - src3
+      def << "- src0[hsi] + src1[hsi] + src2[hsi] - src3[hsi]";
+    }
+    def << ";\n}\n";
+    unsigned int nflops = (nc > 1) ? nc - 1 : 0;
+    def << "/** Number of flops = " << nflops << " */\n";
+    def << context->close_block() << std::endl;
+    def << context->code_postfix();
+  }
+};
+
+template <typename F>
+CR_11_σpσpCoulombσpσp_11<F>::CR_11_σpσpCoulombσpσp_11(
+    const std::shared_ptr<TargetType>& Tint, unsigned int)
+    : ParentType(Tint, 0) {
+  assert(Tint->num_func_bra(/* particle */ 0) == 1);
+  assert(Tint->num_func_bra(/* particle */ 1) == 1);
+  assert(Tint->num_func_ket(/* particle */ 0) == 1);
+  assert(Tint->num_func_ket(/* particle */ 1) == 1);
+
+  F a(Tint->bra(0, 0));
+  F b(Tint->ket(0, 0));
+  F c(Tint->bra(1, 0));
+  F d(Tint->ket(1, 0));
+
+  const auto& oper = Tint->oper();
+
+  if (a.contracted() || b.contracted() || c.contracted() || d.contracted())
+    return;
+
+  using namespace libint2::algebra;
+  using namespace libint2::prefactor;
+  using libint2::algebra::operator*;
+
+  const mType zero_m(0u);
+
+  ChildFactory<ThisType,
+               GenIntegralSet_11_11<BasisFunctionType, TwoPRep, mType>>
+      factory(this);
+
+  constexpr auto x = 0;
+  constexpr auto y = 1;
+  constexpr auto z = 2;
+
+  auto mc = [&](const int r1, const int r2, const int r3, const int r4) {
+    F a_r1{a};
+    a_r1.deriv().inc(r1);
+    F b_r2{b};
+    b_r2.deriv().inc(r2);
+    F c_r3{c};
+    c_r3.deriv().inc(r3);
+    F d_r4{d};
+    d_r4.deriv().inc(r4);
+    return factory.make_child(a_r1, b_r2, c_r3, d_r4, zero_m);
+  };
+
+  // 16-component generation for two independent spin spaces:
+  // ( (σ₁.p) a (σ₁.p) b | 1/r12 | (σ₂.p) c (σ₂.p) d )
+  //
+  // Option A (tensor product) ordering: index = 4 * bra_spin + ket_spin
+  //   bra_spin = index / 4,  ket_spin = index % 4
+  //   spin indices: S=0, X=1, Y=2, Z=3
+  //
+  // Row bra=S: 0=SS, 1=SX, 2=SY, 3=SZ          (T1 + T2)
+  // Row bra=X: 4=XS, 5=XX, 6=XY, 7=XZ          (T3 + T4)
+  // Row bra=Y: 8=YS, 9=YX, 10=YY, 11=YZ        (T3 + T4)
+  // Row bra=Z: 12=ZS, 13=ZX, 14=ZY, 15=ZZ      (T3 + T4)
+  //
+  // T4 components include minus sign from i^2 = -1.
+  switch (oper->descr().quaternion_index()) {
+    // ===== 0: SS = (a.b)(c.d) =====
+    case 0: {
+      auto xxxx = mc(x, x, x, x);
+      auto xxyy = mc(x, x, y, y);
+      auto xxzz = mc(x, x, z, z);
+      auto yyxx = mc(y, y, x, x);
+      auto yyyy = mc(y, y, y, y);
+      auto yyzz = mc(y, y, z, z);
+      auto zzxx = mc(z, z, x, x);
+      auto zzyy = mc(z, z, y, y);
+      auto zzzz = mc(z, z, z, z);
+      if (is_simple()) {
+        expr_ = xxxx + xxyy + xxzz + yyxx + yyyy + yyzz + zzxx + zzyy + zzzz;
+        nflops_ += 8;
+      }
+    } break;
+    // ===== 1: SX = (a.b)(c×d)_x =====
+    case 1: {
+      auto xxyz = mc(x, x, y, z);
+      auto xxzy = mc(x, x, z, y);
+      auto yyyz = mc(y, y, y, z);
+      auto yyzy = mc(y, y, z, y);
+      auto zzyz = mc(z, z, y, z);
+      auto zzzy = mc(z, z, z, y);
+      if (is_simple()) {
+        expr_ = xxyz - xxzy + yyyz - yyzy + zzyz - zzzy;
+        nflops_ += 5;
+      }
+    } break;
+    // ===== 2: SY = (a.b)(c×d)_y =====
+    case 2: {
+      auto xxzx = mc(x, x, z, x);
+      auto xxxz = mc(x, x, x, z);
+      auto yyzx = mc(y, y, z, x);
+      auto yyxz = mc(y, y, x, z);
+      auto zzzx = mc(z, z, z, x);
+      auto zzxz = mc(z, z, x, z);
+      if (is_simple()) {
+        expr_ = xxzx - xxxz + yyzx - yyxz + zzzx - zzxz;
+        nflops_ += 5;
+      }
+    } break;
+    // ===== 3: SZ = (a.b)(c×d)_z =====
+    case 3: {
+      auto xxxy = mc(x, x, x, y);
+      auto xxyx = mc(x, x, y, x);
+      auto yyxy = mc(y, y, x, y);
+      auto yyyx = mc(y, y, y, x);
+      auto zzxy = mc(z, z, x, y);
+      auto zzyx = mc(z, z, y, x);
+      if (is_simple()) {
+        expr_ = xxxy - xxyx + yyxy - yyyx + zzxy - zzyx;
+        nflops_ += 5;
+      }
+    } break;
+    // ===== 4: XS = (a×b)_x(c.d) =====
+    case 4: {
+      auto yzxx = mc(y, z, x, x);
+      auto zyxx = mc(z, y, x, x);
+      auto yzyy = mc(y, z, y, y);
+      auto zyyy = mc(z, y, y, y);
+      auto yzzz = mc(y, z, z, z);
+      auto zyzz = mc(z, y, z, z);
+      if (is_simple()) {
+        expr_ = yzxx - zyxx + yzyy - zyyy + yzzz - zyzz;
+        nflops_ += 5;
+      }
+    } break;
+    // ===== 5: XX = -(a×b)_x(c×d)_x (minus from i²=-1) =====
+    case 5: {
+      auto yzyz = mc(y, z, y, z);
+      auto yzzy = mc(y, z, z, y);
+      auto zyyz = mc(z, y, y, z);
+      auto zyzy = mc(z, y, z, y);
+      if (is_simple()) {
+        expr_ = yzzy - yzyz + zyyz - zyzy;
+        nflops_ += 3;
+      }
+    } break;
+    // ===== 6: XY = -(a×b)_x(c×d)_y =====
+    case 6: {
+      auto yzzx = mc(y, z, z, x);
+      auto yzxz = mc(y, z, x, z);
+      auto zyzx = mc(z, y, z, x);
+      auto zyxz = mc(z, y, x, z);
+      if (is_simple()) {
+        expr_ = yzxz - yzzx + zyzx - zyxz;
+        nflops_ += 3;
+      }
+    } break;
+    // ===== 7: XZ = -(a×b)_x(c×d)_z =====
+    case 7: {
+      auto yzxy = mc(y, z, x, y);
+      auto yzyx = mc(y, z, y, x);
+      auto zyxy = mc(z, y, x, y);
+      auto zyyx = mc(z, y, y, x);
+      if (is_simple()) {
+        expr_ = yzyx - yzxy + zyxy - zyyx;
+        nflops_ += 3;
+      }
+    } break;
+    // ===== 8: YS = (a×b)_y(c.d) =====
+    case 8: {
+      auto zxxx = mc(z, x, x, x);
+      auto xzxx = mc(x, z, x, x);
+      auto zxyy = mc(z, x, y, y);
+      auto xzyy = mc(x, z, y, y);
+      auto zxzz = mc(z, x, z, z);
+      auto xzzz = mc(x, z, z, z);
+      if (is_simple()) {
+        expr_ = zxxx - xzxx + zxyy - xzyy + zxzz - xzzz;
+        nflops_ += 5;
+      }
+    } break;
+    // ===== 9: YX = -(a×b)_y(c×d)_x =====
+    case 9: {
+      auto zxyz = mc(z, x, y, z);
+      auto zxzy = mc(z, x, z, y);
+      auto xzyz = mc(x, z, y, z);
+      auto xzzy = mc(x, z, z, y);
+      if (is_simple()) {
+        expr_ = zxzy - zxyz + xzyz - xzzy;
+        nflops_ += 3;
+      }
+    } break;
+    // ===== 10: YY = -(a×b)_y(c×d)_y =====
+    case 10: {
+      auto zxzx = mc(z, x, z, x);
+      auto zxxz = mc(z, x, x, z);
+      auto xzzx = mc(x, z, z, x);
+      auto xzxz = mc(x, z, x, z);
+      if (is_simple()) {
+        expr_ = zxxz - zxzx + xzzx - xzxz;
+        nflops_ += 3;
+      }
+    } break;
+    // ===== 11: YZ = -(a×b)_y(c×d)_z =====
+    case 11: {
+      auto zxxy = mc(z, x, x, y);
+      auto zxyx = mc(z, x, y, x);
+      auto xzxy = mc(x, z, x, y);
+      auto xzyx = mc(x, z, y, x);
+      if (is_simple()) {
+        expr_ = zxyx - zxxy + xzxy - xzyx;
+        nflops_ += 3;
+      }
+    } break;
+    // ===== 12: ZS = (a×b)_z(c.d) =====
+    case 12: {
+      auto xyxx = mc(x, y, x, x);
+      auto yxxx = mc(y, x, x, x);
+      auto xyyy = mc(x, y, y, y);
+      auto yxyy = mc(y, x, y, y);
+      auto xyzz = mc(x, y, z, z);
+      auto yxzz = mc(y, x, z, z);
+      if (is_simple()) {
+        expr_ = xyxx - yxxx + xyyy - yxyy + xyzz - yxzz;
+        nflops_ += 5;
+      }
+    } break;
+    // ===== 13: ZX = -(a×b)_z(c×d)_x =====
+    case 13: {
+      auto xyyz = mc(x, y, y, z);
+      auto xyzy = mc(x, y, z, y);
+      auto yxyz = mc(y, x, y, z);
+      auto yxzy = mc(y, x, z, y);
+      if (is_simple()) {
+        expr_ = xyzy - xyyz + yxyz - yxzy;
+        nflops_ += 3;
+      }
+    } break;
+    // ===== 14: ZY = -(a×b)_z(c×d)_y =====
+    case 14: {
+      auto xyzx = mc(x, y, z, x);
+      auto xyxz = mc(x, y, x, z);
+      auto yxzx = mc(y, x, z, x);
+      auto yxxz = mc(y, x, x, z);
+      if (is_simple()) {
+        expr_ = xyxz - xyzx + yxzx - yxxz;
+        nflops_ += 3;
+      }
+    } break;
+    // ===== 15: ZZ = -(a×b)_z(c×d)_z =====
+    case 15: {
+      auto xyxy = mc(x, y, x, y);
+      auto xyyx = mc(x, y, y, x);
+      auto yxxy = mc(y, x, x, y);
+      auto yxyx = mc(y, x, y, x);
+      if (is_simple()) {
+        expr_ = xyyx - xyxy + yxxy - yxyx;
+        nflops_ += 3;
+      }
+    } break;
+    default:
+      throw std::runtime_error(
+          "CR_11_σpσpCoulombσpσp_11: invalid component index (expected 0-15)");
+  }
+
+}  // CR_11_σpσpCoulombσpσp_11<F>::CR_11_σpσpCoulombσpσp_11
+};  // namespace libint2
+
+#endif  // LIBINT_COMP_11_ΣPΣPCOULOMBΣPΣP_11_H
diff --git "a/src/bin/libint/comp_1_\317\203pR\317\203p_1.h" "b/src/bin/libint/comp_1_\317\203pR\317\203p_1.h"
new file mode 100644
index 000000000..e698c9047
--- /dev/null
+++ "b/src/bin/libint/comp_1_\317\203pR\317\203p_1.h"
@@ -0,0 +1,161 @@
+/*
+ *  Copyright (C) 2004-2026 Edward F. Valeev
+ *
+ *  This file is part of Libint compiler.
+ *
+ *  Libint compiler is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Libint compiler is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with Libint compiler.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef LIBINT_COMP_1_ΣPRΣP_1_H
+#define LIBINT_COMP_1_ΣPRΣP_1_H
+
+#include <generic_rr.h>
+
+namespace libint2 {
+
+/**
+ * Computes the integral of \f$ \sigma \cdot \hat{p}\, r_k\, \sigma \cdot
+ * \hat{p} \f$ over CGShell/CGF by folding the 9 raw \f$ \sigma_a \partial_a r_k
+ * \sigma_b \partial_b \f$ dyadics per dipole direction \f$ k \f$ to 4
+ * Pauli-quaternion components via \f$ \sigma_a \sigma_b = \delta_{ab} +
+ * i\epsilon_{abc}\sigma_c \f$. 12 outputs total = 3 dipole directions × 4
+ * Pauli components, mirroring the σpVσp fold but with the central operator
+ * being a Cartesian dipole instead of the electrostatic potential V.
+ *
+ * @tparam F basis function type. valid choices are CGShell or CGF
+ */
+template <typename F>
+class CR_1_σpRσp_1
+    : public GenericRecurrenceRelation<
+          CR_1_σpRσp_1<F>, F, GenIntegralSet_1_1<F, σpRσpOper, EmptySet>> {
+ public:
+  typedef CR_1_σpRσp_1<F> ThisType;
+  typedef F BasisFunctionType;
+  typedef σpRσpOper OperType;
+  typedef GenIntegralSet_1_1<F, σpRσpOper, EmptySet> TargetType;
+  typedef GenericRecurrenceRelation<ThisType, BasisFunctionType, TargetType>
+      ParentType;
+  friend class GenericRecurrenceRelation<ThisType, BasisFunctionType,
+                                         TargetType>;
+  static const unsigned int max_nchildren = 100;
+
+  using ParentType::Instance;
+
+  static bool directional() { return false; }
+
+ private:
+  using ParentType::is_simple;
+  using ParentType::target_;
+  using ParentType::RecurrenceRelation::expr_;
+  using ParentType::RecurrenceRelation::nflops_;
+
+  CR_1_σpRσp_1(const std::shared_ptr<TargetType> &, unsigned int = 0);
+
+  static std::string descr() { return "CR"; }
+};
+
+template <typename F>
+CR_1_σpRσp_1<F>::CR_1_σpRσp_1(const std::shared_ptr<TargetType> &Tint,
+                              unsigned int)
+    : ParentType(Tint, 0) {
+  assert(Tint->num_func_bra(/* particle */ 0) == 1);
+  assert(Tint->num_func_ket(/* particle */ 0) == 1);
+  const auto &a = Tint->bra(0, 0);
+  const auto &b = Tint->ket(0, 0);
+  const auto &oper = Tint->oper();
+
+  // express σ·p r_k σ·p in terms of derivative integrals of the dipole
+  // operator r_k for primitive Gaussians only
+  if (a.contracted() || b.contracted()) return;
+
+  using namespace libint2::algebra;
+  using namespace libint2::prefactor;
+  using libint2::algebra::operator*;
+
+  ChildFactory<ThisType,
+               GenIntegralSet_1_1<BasisFunctionType, CartesianMultipoleOper<3u>,
+                                  EmptySet>>
+      factory(this);
+
+  constexpr auto x = 0;
+  constexpr auto y = 1;
+  constexpr auto z = 2;
+
+  F Dx_a{a};
+  Dx_a.deriv().inc(x);
+  F Dx_b{b};
+  Dx_b.deriv().inc(x);
+  F Dy_a{a};
+  Dy_a.deriv().inc(y);
+  F Dy_b{b};
+  Dy_b.deriv().inc(y);
+  F Dz_a{a};
+  Dz_a.deriv().inc(z);
+  F Dz_b{b};
+  Dz_b.deriv().inc(z);
+
+  // Build the dipole multipole descriptor for direction k.
+  const auto k = oper->descr().dipole_dir();
+  CartesianMultipole_Descr<3u> mu_k;
+  mu_k.inc(k, 1);  // r_k = (kx,ky,kz) with k_k = 1, others 0
+
+  // Pauli quaternion fold per (k, q):
+  //   q=0: trace δ_ab → Σ_a (∂_a μ | r_k | ∂_a ν)
+  //   q=1: σ_x antisym → (∂_y μ | r_k | ∂_z ν) − (∂_z μ | r_k | ∂_y ν)
+  //   q=2: σ_y antisym → (∂_z μ | r_k | ∂_x ν) − (∂_x μ | r_k | ∂_z ν)
+  //   q=3: σ_z antisym → (∂_x μ | r_k | ∂_y ν) − (∂_y μ | r_k | ∂_x ν)
+  switch (oper->descr().quaternion_index()) {
+    case 0: {
+      auto Dx_a_R_Dx_b = factory.make_child(Dx_a, Dx_b, EmptySet(), mu_k);
+      auto Dy_a_R_Dy_b = factory.make_child(Dy_a, Dy_b, EmptySet(), mu_k);
+      auto Dz_a_R_Dz_b = factory.make_child(Dz_a, Dz_b, EmptySet(), mu_k);
+      if (is_simple()) {
+        expr_ = Dx_a_R_Dx_b + Dy_a_R_Dy_b + Dz_a_R_Dz_b;
+        nflops_ += 2;
+      }
+    } break;
+    case 1: {
+      auto Dy_a_R_Dz_b = factory.make_child(Dy_a, Dz_b, EmptySet(), mu_k);
+      auto Dz_a_R_Dy_b = factory.make_child(Dz_a, Dy_b, EmptySet(), mu_k);
+      if (is_simple()) {
+        expr_ = Dy_a_R_Dz_b - Dz_a_R_Dy_b;
+        nflops_ += 1;
+      }
+    } break;
+    case 2: {
+      auto Dz_a_R_Dx_b = factory.make_child(Dz_a, Dx_b, EmptySet(), mu_k);
+      auto Dx_a_R_Dz_b = factory.make_child(Dx_a, Dz_b, EmptySet(), mu_k);
+      if (is_simple()) {
+        expr_ = Dz_a_R_Dx_b - Dx_a_R_Dz_b;
+        nflops_ += 1;
+      }
+    } break;
+    case 3: {
+      auto Dx_a_R_Dy_b = factory.make_child(Dx_a, Dy_b, EmptySet(), mu_k);
+      auto Dy_a_R_Dx_b = factory.make_child(Dy_a, Dx_b, EmptySet(), mu_k);
+      if (is_simple()) {
+        expr_ = Dx_a_R_Dy_b - Dy_a_R_Dx_b;
+        nflops_ += 1;
+      }
+    } break;
+    default:
+      throw std::runtime_error("CR_1_σpRσp_1: invalid quaternionic index");
+  }
+
+}  // CR_1_σpRσp_1<F>::CR_1_σpRσp_1
+
+};  // namespace libint2
+
+#endif  // LIBINT_COMP_1_ΣPRΣP_1_H
diff --git "a/src/bin/libint/comp_1_\317\203pV\317\203p_1.h" "b/src/bin/libint/comp_1_\317\203pV\317\203p_1.h"
index 5ecfc796b..9fbdef361 100644
--- "a/src/bin/libint/comp_1_\317\203pV\317\203p_1.h"
+++ "b/src/bin/libint/comp_1_\317\203pV\317\203p_1.h"
@@ -107,7 +107,7 @@ CR_1_σpVσp_1<F>::CR_1_σpVσp_1(const std::shared_ptr<TargetType> &Tint,
 
   // (a|W0|b) = (d a/dAx | V | d b/dBx) + (d a/dAy | V | d b/dBy) + (d a/dAz | V
   // | d b/dBz)
-  switch (oper->descr().quaternionic_index()) {
+  switch (oper->descr().quaternion_index()) {
     case 0: {
       auto Dx_a_V_Dx_b = factory.make_child(Dx_a, Dx_b, zero_m);
       auto Dy_a_V_Dy_b = factory.make_child(Dy_a, Dy_b, zero_m);
@@ -146,7 +146,7 @@ CR_1_σpVσp_1<F>::CR_1_σpVσp_1(const std::shared_ptr<TargetType> &Tint,
       }
     } break;
     default:
-      throw std::runtime_error("CR_1_σpVσp_1: invalid Pauli index");
+      throw std::runtime_error("CR_1_σpVσp_1: invalid quaternionic index");
   }
 
 }  // CR_1_σpVσp_1<F>::CR_1_σpVσp_1
diff --git a/src/bin/libint/comp_deriv_gauss_v2.h b/src/bin/libint/comp_deriv_gauss_v2.h
new file mode 100644
index 000000000..970bab0ae
--- /dev/null
+++ b/src/bin/libint/comp_deriv_gauss_v2.h
@@ -0,0 +1,547 @@
+/*
+ *  Copyright (C) 2004-2026 Edward F. Valeev
+ *
+ *  This file is part of Libint compiler.
+ *
+ *  Libint compiler is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Libint compiler is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with Libint compiler.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef _libint2_src_bin_libint_compderivgaussv2_h_
+#define _libint2_src_bin_libint_compderivgaussv2_h_
+
+#include <algebra.h>
+#include <context.h>
+#include <default_params.h>
+#include <dgvertex.h>
+#include <dims.h>
+#include <integral.h>
+#include <prefactors.h>
+#include <rr.h>
+#include <task.h>
+
+#include <cassert>
+#include <iostream>
+#include <set>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace libint2 {
+
+/** Optimized compute relation for (geometric) derivative Gaussian integrals.
+ *
+ * Like CR_DerivGauss, this expands derivative Gaussians via:
+ *   d/dr G(a) = 2*alpha * G(a+1) - a_i * G(a-1)
+ *
+ * Unlike CR_DerivGauss, this uses the HRR-like code-sharing optimization:
+ * since differentiation of a Gaussian at position (part, where) depends only
+ * on that shell's quanta (not spectator shells), we generate code once per
+ * unique differentiated shell and pass spectator dimensions at the call site.
+ *
+ * @tparam IntType integral type
+ * @tparam part particle index of the function to be differentiated
+ * @tparam where position of the function to be differentiated (InBra/InKet)
+ * @tparam trans_inv_part if non-negative, specifies the particle index for
+ *         translational invariance
+ * @tparam trans_inv_where position for translational invariance
+ */
+template <class IntType, int part, FunctionPosition where,
+          int trans_inv_part = -1, FunctionPosition trans_inv_where = InBra>
+class DerivGaussV2 : public RecurrenceRelation {
+ private:
+  static constexpr auto trans_inv_oper =
+      not IntType::OperType::Properties::odep;
+  static constexpr auto using_trans_inv =
+      trans_inv_oper && (part == trans_inv_part) && (where == trans_inv_where);
+
+ public:
+  typedef RecurrenceRelation ParentType;
+  typedef typename IntType::BasisFunctionType BasisFunctionType;
+  typedef DerivGaussV2 ThisType;
+  typedef IntType TargetType;
+  typedef IntType ChildType;
+  typedef RecurrenceRelation::ExprType ExprType;
+
+  static const unsigned int max_nchildren_ =
+      using_trans_inv ? (IntType::num_bf - 1) : 2u;
+
+  static std::shared_ptr<ThisType> Instance(
+      const std::shared_ptr<TargetType>& Tint, unsigned int dir = 0);
+  virtual ~DerivGaussV2() {}
+
+  /// always directional
+  static bool directional() { return true; }
+
+  unsigned int num_children() const override { return nchildren_; }
+  std::shared_ptr<DGVertex> rr_target() const override {
+    return std::static_pointer_cast<DGVertex, TargetType>(target_);
+  }
+  std::shared_ptr<DGVertex> rr_child(unsigned int i) const override {
+    return children_.at(i);
+  }
+  bool is_simple() const override {
+    return TrivialBFSet<BasisFunctionType>::result;
+  }
+
+  std::string spfunction_call(
+      const std::shared_ptr<CodeContext>& context,
+      const std::shared_ptr<ImplicitDimensions>& dims) const override;
+
+ private:
+  DerivGaussV2(const std::shared_ptr<TargetType>& Tint, unsigned int dir);
+
+  unsigned int dir_;
+  std::shared_ptr<TargetType> target_;
+  std::vector<std::shared_ptr<DGVertex>> children_;
+  unsigned int nchildren_;
+
+  std::string generate_label() const override;
+  std::shared_ptr<ImplicitDimensions> adapt_dims_(
+      const std::shared_ptr<ImplicitDimensions>& dims) const override;
+  bool register_with_rrstack() const;
+  bool expl_high_dim() const;
+  bool expl_low_dim() const;
+
+  /// add child, deduplicating
+  const std::shared_ptr<DGVertex>& add_child(
+      const std::shared_ptr<DGVertex>& child) {
+    for (auto& c : children_) {
+      if (c == child) return c;
+    }
+    children_.push_back(child);
+    ++nchildren_;
+    return children_.back();
+  }
+};
+
+//
+// Implementation
+//
+
+template <class IntType, int part, FunctionPosition where, int trans_inv_part,
+          FunctionPosition trans_inv_where>
+std::shared_ptr<
+    DerivGaussV2<IntType, part, where, trans_inv_part, trans_inv_where>>
+DerivGaussV2<IntType, part, where, trans_inv_part, trans_inv_where>::Instance(
+    const std::shared_ptr<TargetType>& Tint, unsigned int dir) {
+  std::shared_ptr<ThisType> this_ptr(new ThisType(Tint, dir));
+  if (this_ptr->num_children() != 0) {
+    this_ptr->register_with_rrstack();
+    return this_ptr;
+  }
+  return std::shared_ptr<ThisType>();
+}
+
+template <class IntType, int part, FunctionPosition where, int trans_inv_part,
+          FunctionPosition trans_inv_where>
+DerivGaussV2<IntType, part, where, trans_inv_part, trans_inv_where>::
+    DerivGaussV2(const std::shared_ptr<TargetType>& Tint, unsigned int dir)
+    : dir_(dir), target_(Tint), nchildren_(0) {
+  using namespace libint2::algebra;
+  using namespace libint2::prefactor;
+  using namespace libint2::braket;
+  typedef BasisFunctionType F;
+  const F& _1 = unit<F>(is_simple() ? dir : 0);
+
+  const typename IntType::AuxQuantaType& aux = Tint->aux();
+  const typename IntType::OperType& oper = Tint->oper();
+
+  children_.reserve(max_nchildren_);
+
+  // the Gaussian must be differentiated in direction dir
+  {
+    if (where == InBra && Tint->bra(part, 0).deriv().d(dir) == 0) return;
+    if (where == InKet && Tint->ket(part, 0).deriv().d(dir) == 0) return;
+  }
+
+  // if not using translational invariance, can only expand primitives
+  if (not using_trans_inv) {
+    if (where == InBra && Tint->bra(part, 0).contracted()) return;
+    if (where == InKet && Tint->ket(part, 0).contracted()) return;
+  }
+
+  typedef typename IntType::BraType IBraType;
+  typedef typename IntType::KetType IKetType;
+  IBraType* bra = new IBraType(Tint->bra());
+  IKetType* ket = new IKetType(Tint->ket());
+
+  if (not using_trans_inv) {  // differentiate
+
+    if (where == InBra) {
+      F a(bra->member(part, 0));
+
+      // add a+1
+      F ap1(bra->member(part, 0) + _1);
+      ap1.deriv().dec(dir);
+      bra->set_member(ap1, part, 0);
+      auto int_ap1 = add_child(IntType::Instance(*bra, *ket, aux, oper));
+      bra->set_member(a, part, 0);
+      if (is_simple()) {
+        std::ostringstream oss;
+        oss << "two_alpha" << part << "_bra";
+        expr_ = Scalar(oss.str()) * int_ap1;
+        nflops_ += 1;
+      }
+
+      // See if a-1 exists
+      F am1(bra->member(part, 0) - _1);
+      if (exists(am1)) {
+        am1.deriv().dec(dir);
+        bra->set_member(am1, part, 0);
+        auto int_am1 = add_child(IntType::Instance(*bra, *ket, aux, oper));
+        bra->set_member(a, part, 0);
+        if (is_simple()) {
+          expr_ -= Scalar(a[dir]) * int_am1;
+          nflops_ += 2;
+        }
+      }
+      delete bra;
+      delete ket;
+      return;
+    }
+
+    if (where == InKet) {
+      F a(ket->member(part, 0));
+
+      // add a+1
+      F ap1(ket->member(part, 0) + _1);
+      ap1.deriv().dec(dir);
+      ket->set_member(ap1, part, 0);
+      auto int_ap1 = add_child(IntType::Instance(*bra, *ket, aux, oper));
+      ket->set_member(a, part, 0);
+      if (is_simple()) {
+        std::ostringstream oss;
+        oss << "two_alpha" << part << "_ket";
+        expr_ = Scalar(oss.str()) * int_ap1;
+        nflops_ += 1;
+      }
+
+      // See if a-1 exists
+      F am1(ket->member(part, 0) - _1);
+      if (exists(am1)) {
+        am1.deriv().dec(dir);
+        ket->set_member(am1, part, 0);
+        auto int_am1 = add_child(IntType::Instance(*bra, *ket, aux, oper));
+        ket->set_member(a, part, 0);
+        if (is_simple()) {
+          expr_ -= Scalar(a[dir]) * int_am1;
+          nflops_ += 2;
+        }
+      }
+      delete bra;
+      delete ket;
+      return;
+    }
+
+  } else {  // use translational invariance
+
+    // remove one deriv quantum from the target function
+    if (where == InBra) bra->member(part, 0).deriv().dec(dir);
+    if (where == InKet) ket->member(part, 0).deriv().dec(dir);
+
+    int term_count = 0;
+    for (int p = 0; p != IntType::num_particles; ++p) {
+      typedef BasisFunctionType F;
+      if (p != trans_inv_part || trans_inv_where != InBra) {
+        F a(bra->member(p, 0));
+        if (not a.is_unit()) {
+          F da(a);
+          da.deriv().inc(dir);
+          bra->set_member(da, p, 0);
+          auto int_da = add_child(IntType::Instance(*bra, *ket, aux, oper));
+          bra->set_member(a, p, 0);
+          if (is_simple()) {
+            if (term_count == 0)
+              expr_ = Scalar(-1) * int_da;
+            else
+              expr_ -= int_da;
+            ++term_count;
+            nflops_ += 1;
+          }
+        }
+      }
+      if (p != trans_inv_part || trans_inv_where != InKet) {
+        F a(ket->member(p, 0));
+        if (not a.is_unit()) {
+          F da(a);
+          da.deriv().inc(dir);
+          ket->set_member(da, p, 0);
+          auto int_da = add_child(IntType::Instance(*bra, *ket, aux, oper));
+          ket->set_member(a, p, 0);
+          if (is_simple()) {
+            if (term_count == 0)
+              expr_ = Scalar(-1) * int_da;
+            else
+              expr_ -= int_da;
+            ++term_count;
+            nflops_ += 1;
+          }
+        }
+      }
+    }
+  }
+
+  delete bra;
+  delete ket;
+}
+
+template <class IntType, int part, FunctionPosition where, int trans_inv_part,
+          FunctionPosition trans_inv_where>
+bool DerivGaussV2<IntType, part, where, trans_inv_part,
+                  trans_inv_where>::register_with_rrstack() const {
+  using std::swap;
+
+  // only register RRs for shell sets (not individual integrals)
+  if (TrivialBFSet<BasisFunctionType>::result) return false;
+
+  // translational invariance path not optimized yet — register as-is
+  if (using_trans_inv) {
+    std::shared_ptr<RRStack> rrstack = RRStack::Instance();
+    std::shared_ptr<ThisType> this_ptr =
+        std::const_pointer_cast<ThisType, const ThisType>(
+            std::static_pointer_cast<const ThisType, const ParentType>(
+                std::enable_shared_from_this<ParentType>::shared_from_this()));
+    rrstack->find(this_ptr);
+    return true;
+  }
+
+  typedef typename IntType::BraType IBraType;
+  typedef typename IntType::KetType IKetType;
+  const IBraType& bra = target_->bra();
+  const IKetType& ket = target_->ket();
+
+  // check if all spectator shells already have zero quanta
+  bool nonzero_quanta = false;
+  unsigned const int npart = IntType::OperatorType::Properties::np;
+  for (unsigned int p = 0; p < npart; p++) {
+    int nfbra = bra.num_members(p);
+    for (int f = 0; f < nfbra; f++) {
+      // skip the differentiated position
+      if (static_cast<int>(p) == part && where == InBra) continue;
+      if (!bra.member(p, f).zero() || !bra.member(p, f).deriv().zero())
+        nonzero_quanta = true;
+    }
+    int nfket = ket.num_members(p);
+    for (int f = 0; f < nfket; f++) {
+      if (static_cast<int>(p) == part && where == InKet) continue;
+      if (!ket.member(p, f).zero() || !ket.member(p, f).deriv().zero())
+        nonzero_quanta = true;
+    }
+  }
+
+  // if all spectators are zero, register this instance directly
+  if (!nonzero_quanta) {
+    std::shared_ptr<RRStack> rrstack = RRStack::Instance();
+    std::shared_ptr<ThisType> this_ptr =
+        std::const_pointer_cast<ThisType, const ThisType>(
+            std::static_pointer_cast<const ThisType, const ParentType>(
+                std::enable_shared_from_this<ParentType>::shared_from_this()));
+    rrstack->find(this_ptr);
+    return true;
+  }
+
+  // Otherwise, zero out all spectator shells and register a dummy
+  IBraType bra_zero(bra);
+  IKetType ket_zero(ket);
+  for (unsigned int p = 0; p < npart; p++) {
+    int nfbra = bra_zero.num_members(p);
+    for (int f = 0; f < nfbra; f++) {
+      if (static_cast<int>(p) == part && where == InBra) continue;
+      typedef typename IBraType::bfs_type bfs_type;
+      typedef typename IBraType::bfs_ref bfs_ref;
+      bfs_ref bfs = bra_zero.member(p, f);
+      if (!bfs.zero() || !bfs.deriv().zero()) {
+        bfs_type null_bfs;
+        swap(bfs, null_bfs);
+      }
+    }
+    int nfket = ket_zero.num_members(p);
+    for (int f = 0; f < nfket; f++) {
+      if (static_cast<int>(p) == part && where == InKet) continue;
+      typedef typename IKetType::bfs_type bfs_type;
+      typedef typename IKetType::bfs_ref bfs_ref;
+      bfs_ref bfs = ket_zero.member(p, f);
+      if (!bfs.zero() || !bfs.deriv().zero()) {
+        bfs_type null_bfs;
+        swap(bfs, null_bfs);
+      }
+    }
+  }
+
+  // create a generic integral with a dummy operator
+  typedef GenOper<GenMultSymmOper_Descr<IntType::OperatorType::Properties::np>>
+      DummyOper;
+  typedef EmptySet DummyQuanta;
+  typedef GenIntegralSet<DummyOper, IncableBFSet, IBraType, IKetType,
+                         DummyQuanta>
+      DummyIntegral;
+  DummyOper dummy_oper;
+  DummyQuanta dummy_quanta(std::vector<int>(0, 0));
+  std::shared_ptr<DummyIntegral> dummy_integral =
+      DummyIntegral::Instance(bra_zero, ket_zero, dummy_quanta, dummy_oper);
+
+  // construct a DerivGaussV2 over the dummy integral and register it
+  typedef DerivGaussV2<DummyIntegral, part, where> DummyDerivGaussV2;
+  std::shared_ptr<DummyDerivGaussV2> dummy_rr =
+      DummyDerivGaussV2::Instance(dummy_integral, dir_);
+  std::shared_ptr<RRStack> rrstack = RRStack::Instance();
+  rrstack->find(dummy_rr);
+  return true;
+}
+
+template <class IntType, int part, FunctionPosition where, int trans_inv_part,
+          FunctionPosition trans_inv_where>
+std::string DerivGaussV2<IntType, part, where, trans_inv_part,
+                         trans_inv_where>::generate_label() const {
+  std::ostringstream os;
+
+  // For translational invariance, children depend on ALL shells, so
+  // the label must include full integral info (no code sharing).
+  // For direct differentiation, only the differentiated shell matters.
+  if constexpr (using_trans_inv) {
+    typedef typename TargetType::AuxIndexType mType;
+    static std::shared_ptr<mType> aux0(new mType(0u));
+    os << "CR_DerivGauss"
+       << "P" << part << to_string(where)
+       << genintegralset_label(target_->bra(), target_->ket(), aux0,
+                               target_->oper());
+    return os.str();
+  }
+
+  os << "DerivGaussV2 P" << part << " " << to_string(where) << " ";
+
+  // Only encode the differentiated shell — not spectators
+  if (where == InBra) {
+    BasisFunctionType sh(target_->bra(part, 0));
+    sh.uncontract();
+    os << sh.label();
+  } else {
+    BasisFunctionType sh(target_->ket(part, 0));
+    sh.uncontract();
+    os << sh.label();
+  }
+
+  return os.str();
+}
+
+template <class IntType, int part, FunctionPosition where, int trans_inv_part,
+          FunctionPosition trans_inv_where>
+std::string
+DerivGaussV2<IntType, part, where, trans_inv_part, trans_inv_where>::
+    spfunction_call(const std::shared_ptr<CodeContext>& context,
+                    const std::shared_ptr<ImplicitDimensions>& dims) const {
+  std::ostringstream os;
+  os << context->label_to_function_name(label()) << "(inteval, "
+     << context->value_to_pointer(rr_target()->symbol());
+
+  const unsigned int nc = num_children();
+  for (unsigned int c = 0; c < nc; c++) {
+    os << ", " << context->value_to_pointer(rr_child(c)->symbol());
+  }
+
+  // compute hsr and lsr — dimensions of spectator shells
+  // canonical order: for each particle p, bra then ket
+  // hsr = product of dims before (part, where)
+  // lsr = product of dims after (part, where)
+  unsigned int hsr = 1;
+  unsigned int lsr = 1;
+  const unsigned int np = IntType::OperType::Properties::np;
+  for (int p = 0; p < static_cast<int>(np); p++) {
+    unsigned int nbra = target_->bra().num_members(p);
+    assert(nbra == 1);
+    for (unsigned int i = 0; i < nbra; i++) {
+      SubIterator* iter = target_->bra().member_subiter(p, i);
+      if (p < part || (p == part && where == InKet)) hsr *= iter->num_iter();
+      // skip p == part && where == InBra (the differentiated shell)
+      if (p > part) lsr *= iter->num_iter();
+      delete iter;
+    }
+    unsigned int nket = target_->ket().num_members(p);
+    assert(nket == 1);
+    for (unsigned int i = 0; i < nket; i++) {
+      SubIterator* iter = target_->ket().member_subiter(p, i);
+      if (p < part) hsr *= iter->num_iter();
+      // skip p == part && where == InKet (the differentiated shell)
+      if (p > part || (p == part && where == InBra)) lsr *= iter->num_iter();
+      delete iter;
+    }
+  }
+
+  // Use TaskParameters to keep track of maximum ranks
+  LibraryTaskManager& taskmgr = LibraryTaskManager::Instance();
+  taskmgr.current().params()->max_hrr_hsrank(hsr);
+
+  if (expl_high_dim()) os << "," << hsr;
+  if (expl_low_dim()) os << "," << lsr;
+  os << ")" << context->end_of_stat() << std::endl;
+  return os.str();
+}
+
+template <class IntType, int part, FunctionPosition where, int trans_inv_part,
+          FunctionPosition trans_inv_where>
+bool DerivGaussV2<IntType, part, where, trans_inv_part,
+                  trans_inv_where>::expl_high_dim() const {
+  // translational invariance: no code sharing, no explicit dims
+  if (using_trans_inv) return false;
+  // need explicit high dim unless this is the first position
+  if (part == 0 && where == InBra) return false;
+  return true;
+}
+
+template <class IntType, int part, FunctionPosition where, int trans_inv_part,
+          FunctionPosition trans_inv_where>
+bool DerivGaussV2<IntType, part, where, trans_inv_part,
+                  trans_inv_where>::expl_low_dim() const {
+  // translational invariance: no code sharing, no explicit dims
+  if (using_trans_inv) return false;
+  unsigned int np = IntType::OperType::Properties::np;
+  // need explicit low dim unless this is the last position
+  if (static_cast<int>(np) - 1 == part && where == InKet) return false;
+  // corner case: 1-particle operator
+  if (np == 1) return true;
+  return true;
+}
+
+template <class IntType, int part, FunctionPosition where, int trans_inv_part,
+          FunctionPosition trans_inv_where>
+std::shared_ptr<ImplicitDimensions>
+DerivGaussV2<IntType, part, where, trans_inv_part, trans_inv_where>::
+    adapt_dims_(const std::shared_ptr<ImplicitDimensions>& dims) const {
+  bool high_rank = expl_high_dim();
+  bool low_rank = expl_low_dim();
+
+  std::shared_ptr<Entity> high_dim, low_dim;
+  if (high_rank) {
+    high_dim =
+        std::shared_ptr<Entity>(new RTimeEntity<EntityTypes::Int>("highdim"));
+  } else {
+    high_dim = dims->high();
+  }
+  if (low_rank) {
+    low_dim =
+        std::shared_ptr<Entity>(new RTimeEntity<EntityTypes::Int>("lowdim"));
+  } else {
+    low_dim = dims->low();
+  }
+
+  std::shared_ptr<ImplicitDimensions> localdims(
+      new ImplicitDimensions(high_dim, low_dim, dims->vecdim()));
+  return localdims;
+}
+
+};  // namespace libint2
+
+#endif
diff --git a/src/bin/libint/dg.cc b/src/bin/libint/dg.cc
index eded6a54f..9e03277ae 100644
--- a/src/bin/libint/dg.cc
+++ b/src/bin/libint/dg.cc
@@ -499,10 +499,11 @@ void DirectedGraph::apply_to(const std::shared_ptr<DGVertex>& vertex,
 
 // Optimize out simple recurrence relations
 void DirectedGraph::optimize_rr_out(
-    const std::shared_ptr<CodeContext>& context) {
+    const std::shared_ptr<CodeContext>& context,
+    const std::shared_ptr<ImplicitDimensions>& dims) {
   replace_rr_with_expr();
   remove_trivial_arithmetics();
-  handle_trivial_nodes(context);
+  handle_trivial_nodes(context, dims);
   remove_disconnected_vertices();
   find_subtrees();
 }
@@ -797,7 +798,8 @@ inline std::string to_vector_symbol(const std::shared_ptr<DGVertex>& v) {
 // refer to another node so that no code is generated for it.
 //
 void DirectedGraph::handle_trivial_nodes(
-    const std::shared_ptr<CodeContext>& context) {
+    const std::shared_ptr<CodeContext>& context,
+    const std::shared_ptr<ImplicitDimensions>& dims) {
   typedef vertices::iterator iter;
   for (iter v = stack_.begin(); v != stack_.end(); ++v) {
     const ver_ptr& vptr = vertex_ptr(*v);
@@ -821,8 +823,6 @@ void DirectedGraph::handle_trivial_nodes(
           // if (child->symbol_set() == false)
           {
             const std::string stack_name("stack");
-            const std::shared_ptr<ImplicitDimensions>& dims =
-                ImplicitDimensions::default_dims();
             std::string low_rank = dims->low_label();
             std::string veclen = dims->vecdim_label();
 
diff --git a/src/bin/libint/dg.h b/src/bin/libint/dg.h
index 0f85abc8f..19e96074c 100644
--- a/src/bin/libint/dg.h
+++ b/src/bin/libint/dg.h
@@ -22,6 +22,7 @@
 #define _libint2_src_bin_libint_dg_h_
 
 #include <dgvertex.h>
+#include <dims.h>
 #include <exception.h>
 #include <global_macros.h>
 #include <key.h>
@@ -253,7 +254,9 @@ class DirectedGraph : public std::enable_shared_from_this<DirectedGraph> {
       optimized away. optimize_rr_out() will replace all simple recurrence
      relations with code representing them.
    */
-  void optimize_rr_out(const std::shared_ptr<CodeContext>& context);
+  void optimize_rr_out(const std::shared_ptr<CodeContext>& context,
+                       const std::shared_ptr<ImplicitDimensions>& dims =
+                           ImplicitDimensions::default_dims());
 
   /** after all apply's have been called, traverse()
       construct a heuristic order of traversal for the graph.
@@ -438,7 +441,9 @@ class DirectedGraph : public std::enable_shared_from_this<DirectedGraph> {
   to their equivalents (such as (ss|ss) shell quartet can only be connected to
   (ss|ss) integral)
    */
-  void handle_trivial_nodes(const std::shared_ptr<CodeContext>& context);
+  void handle_trivial_nodes(const std::shared_ptr<CodeContext>& context,
+                            const std::shared_ptr<ImplicitDimensions>& dims =
+                                ImplicitDimensions::default_dims());
   /// This functions removes vertices not connected to other vertices
   void remove_disconnected_vertices();
   /** Finds (binary) subtrees. The subtrees correspond to a single-line code (no
diff --git a/src/bin/libint/gauss.cc b/src/bin/libint/gauss.cc
index 3899283ab..aa60b1de7 100644
--- a/src/bin/libint/gauss.cc
+++ b/src/bin/libint/gauss.cc
@@ -115,8 +115,15 @@ std::string CGF::label() const {
   unsigned int am = qn_[0] + qn_[1] + qn_[2];
   std::string deriv_label;
   if (!deriv_.zero()) deriv_label = deriv_.label();
-  const std::string am_string = am_to_symbol(am, contracted());
+  std::string am_string = am_to_symbol(am, contracted());
   std::ostringstream oss;
+
+  // Some OSs can have case-insensitive filesystem e.g., MacOS. So here we add
+  // additional identifier for primitive function labels
+  if (!this->contracted()) {
+    am_string += "_";
+  }
+
   oss << (pure_sh_ && am > 0 ? "W" : "") << am_string << deriv_label << "_";
   if (am == 0) return oss.str();
 
@@ -223,8 +230,16 @@ CGShell::~CGShell() {}
 
 std::string CGShell::label() const {
   if (is_unit()) return "unit";
-  std::string result = std::string(pure_sh_ && qn_[0] > 0 ? "W" : "") +
-                       am_to_symbol(qn_[0], contracted());
+  std::string am_symbol = am_to_symbol(qn_[0], contracted());
+
+  // Some OSs can have case-insensitive filesystem e.g., MacOS. So here we add
+  // additional identifier for primitive shell labels
+  if (!this->contracted()) {
+    am_symbol += "_";
+  }
+
+  std::string result =
+      std::string(pure_sh_ && qn_[0] > 0 ? "W" : "") + am_symbol;
   if (!deriv_.zero()) result += deriv_.label();
   return result;
 }
diff --git a/src/bin/libint/master_ints_list.h b/src/bin/libint/master_ints_list.h
index 1aa8c3e64..158052fba 100644
--- a/src/bin/libint/master_ints_list.h
+++ b/src/bin/libint/master_ints_list.h
@@ -21,12 +21,15 @@
 #ifndef _libint2_src_bin_libint_masterintslist_h_
 #define _libint2_src_bin_libint_masterintslist_h_
 
-// need extra-long mpl list
+// need extra-long mpl list — split across two sub-lists and joined via
+// boost::mpl::joint_view, since boost only ships pre-generated list headers up
+// to size 50.
 #define BOOST_MPL_CFG_NO_PREPROCESSED_HEADERS
 #define BOOST_MPL_LIMIT_LIST_SIZE 50
 #include <bfset.h>
 #include <oper.h>
 
+#include <boost/mpl/joint_view.hpp>
 #include <boost/mpl/list.hpp>
 #if LIBINT_SUPPORT_ONEBODYINTS
 #include <integral_1_1.h>
@@ -47,6 +50,8 @@ typedef GenIntegralSet_1_1<CGShell, ElecPotOper, mType> ElecPot_1_1_sh;
 typedef GenIntegralSet_1_1<CGF, ElecPotOper, mType> ElecPot_1_1_int;
 typedef GenIntegralSet_1_1<CGShell, σpVσpOper, EmptySet> σpVσp_1_1_sh;
 typedef GenIntegralSet_1_1<CGF, σpVσpOper, EmptySet> σpVσp_1_1_int;
+typedef GenIntegralSet_1_1<CGShell, σpRσpOper, EmptySet> σpRσp_1_1_sh;
+typedef GenIntegralSet_1_1<CGF, σpRσpOper, EmptySet> σpRσp_1_1_int;
 typedef GenIntegralSet_1_1<CGShell, CartesianMultipoleOper<3u>, EmptySet>
     CMultipole_1_1_sh;
 typedef GenIntegralSet_1_1<CGF, CartesianMultipoleOper<3u>, EmptySet>
@@ -106,6 +111,16 @@ typedef GenIntegralSet_1_1<CGF1d<CartesianAxis_Z>, CartesianMultipoleOper<1u>,
 //////////////////////////
 typedef GenIntegralSet_11_11<CGShell, TwoPRep, mType> TwoPRep_11_11_sq;
 typedef GenIntegralSet_11_11<CGF, TwoPRep, mType> TwoPRep_11_11_int;
+typedef GenIntegralSet_11_11<CGShell, CoulombσpσpOper, mType>
+    Coulombσpσp_11_11_sq;
+typedef GenIntegralSet_11_11<CGF, CoulombσpσpOper, mType> Coulombσpσp_11_11_int;
+typedef GenIntegralSet_11_11<CGShell, σpσpCoulombσpσpOper, mType>
+    σpσpCoulombσpσp_11_11_sq;
+typedef GenIntegralSet_11_11<CGF, σpσpCoulombσpσpOper, mType>
+    σpσpCoulombσpσp_11_11_int;
+typedef GenIntegralSet_11_11<CGShell, σpCoulombσpOper, mType>
+    σpCoulombσp_11_11_sq;
+typedef GenIntegralSet_11_11<CGF, σpCoulombσpOper, mType> σpCoulombσp_11_11_int;
 typedef GenIntegralSet_11_11<CGShell, R12kG12, mType> R12kG12_11_11_sq;
 typedef GenIntegralSet_11_11<CGF, R12kG12, mType> R12kG12_11_11_int;
 typedef GenIntegralSet_11_11<CGShell, R12kR12lG12, EmptySet>
@@ -138,17 +153,24 @@ typedef boost::mpl::list<
     Overlap_1_1_sh_y, Overlap_1_1_int_y, Overlap_1_1_sh_z, Overlap_1_1_int_z,
     Kinetic_1_1_sh, Kinetic_1_1_int, Kinetic_1_1_sh_x, Kinetic_1_1_int_x,
     Kinetic_1_1_sh_y, Kinetic_1_1_int_y, Kinetic_1_1_sh_z, Kinetic_1_1_int_z,
-    ElecPot_1_1_sh, ElecPot_1_1_int, σpVσp_1_1_sh, σpVσp_1_1_int,
-    CMultipole_1_1_sh, CMultipole_1_1_int, CMultipole_1_1_sh_x,
+    ElecPot_1_1_sh, ElecPot_1_1_int, σpVσp_1_1_sh, σpVσp_1_1_int, σpRσp_1_1_sh,
+    σpRσp_1_1_int, CMultipole_1_1_sh, CMultipole_1_1_int, CMultipole_1_1_sh_x,
     CMultipole_1_1_sh_y, CMultipole_1_1_sh_z, CMultipole_1_1_int_x,
     CMultipole_1_1_int_y, CMultipole_1_1_int_z, SMultipole_1_1_sh,
-    SMultipole_1_1_int,
 #endif
-    TwoPRep_11_11_sq, TwoPRep_11_11_int, R12kG12_11_11_sq, R12kG12_11_11_int,
-    R12kR12lG12_11_11_sq, R12kR12lG12_11_11_int, TiG12_11_11_sq,
-    TiG12_11_11_int, G12TiG12_11_11_sq, G12TiG12_11_11_int,
+    SMultipole_1_1_int>
+    MasterIntegralTypeList_1body_part;
+typedef boost::mpl::list<
+    TwoPRep_11_11_sq, TwoPRep_11_11_int, Coulombσpσp_11_11_sq,
+    Coulombσpσp_11_11_int, σpσpCoulombσpσp_11_11_sq, σpσpCoulombσpσp_11_11_int,
+    σpCoulombσp_11_11_sq, σpCoulombσp_11_11_int, R12kG12_11_11_sq,
+    R12kG12_11_11_int, R12kR12lG12_11_11_sq, R12kR12lG12_11_11_int,
+    TiG12_11_11_sq, TiG12_11_11_int, G12TiG12_11_11_sq, G12TiG12_11_11_int,
     DivG12prime_xTx_11_11_sq, DivG12prime_xTx_11_11_int,
     DummySymmIntegral_11_11_sq, DummySymmIntegral_11_11_int>
+    MasterIntegralTypeList_2body_part;
+typedef boost::mpl::joint_view<MasterIntegralTypeList_1body_part,
+                               MasterIntegralTypeList_2body_part>
     MasterIntegralTypeList;
 
 };  // namespace libint2
diff --git a/src/bin/libint/master_rrs_list.h b/src/bin/libint/master_rrs_list.h
index f3ec4e2d2..ae4e32311 100644
--- a/src/bin/libint/master_rrs_list.h
+++ b/src/bin/libint/master_rrs_list.h
@@ -21,12 +21,17 @@
 #ifndef _libint2_src_bin_libint_masterrrslist_h_
 #define _libint2_src_bin_libint_masterrrslist_h_
 
+#include <comp_11_Coulombσpσp_11.h>
 #include <comp_11_DivG12prime_xTx_11.h>
 #include <comp_11_g12tig12_11.h>
+#include <comp_11_σpCoulombσp_11.h>
 #include <comp_11_r12kr12lg12_11.h>
 #include <comp_11_tig12_11.h>
+#include <comp_11_σpσpCoulombσpσp_11.h>
+#include <comp_1_σpRσp_1.h>
 #include <comp_1_σpVσp_1.h>
 #include <comp_deriv_gauss.h>
+#include <comp_deriv_gauss_v2.h>
 #include <comp_xyz.h>
 #include <generic_rr.h>
 #include <hrr.h>
@@ -178,6 +183,9 @@ typedef VRR_1_ElecPot_1<CGF, InKet> VRR_b_1_ElecPot_1_int;
 typedef CR_1_σpVσp_1<CGShell> CR_1_σpVσp_1_sh;
 typedef CR_1_σpVσp_1<CGF> CR_1_σpVσp_1_int;
 
+typedef CR_1_σpRσp_1<CGShell> CR_1_σpRσp_1_sh;
+typedef CR_1_σpRσp_1<CGF> CR_1_σpRσp_1_int;
+
 // TODO investigate whether need to stay away from HRR for now to be sure that
 // multipoles are computed as precisely as possible
 typedef HRR<SMultipole_1_1_sh, CGShell, 0, InBra, 0, InKet, 0>
@@ -266,6 +274,59 @@ typedef CR_DerivGauss<TwoPRep_11_11_int, 1, InKet, trinvskip2_part,
                       trinvskip2_where>
     Deriv_d_11_TwoPRep_11_int;
 
+// DerivGaussV2 for TwoPRep (shell sets)
+typedef DerivGaussV2<TwoPRep_11_11_sq, 0, InBra, trinvskip2_part,
+                     trinvskip2_where>
+    DerivV2_a_11_TwoPRep_11_sh;
+typedef DerivGaussV2<TwoPRep_11_11_sq, 0, InKet, trinvskip2_part,
+                     trinvskip2_where>
+    DerivV2_b_11_TwoPRep_11_sh;
+typedef DerivGaussV2<TwoPRep_11_11_sq, 1, InBra, trinvskip2_part,
+                     trinvskip2_where>
+    DerivV2_c_11_TwoPRep_11_sh;
+typedef DerivGaussV2<TwoPRep_11_11_sq, 1, InKet, trinvskip2_part,
+                     trinvskip2_where>
+    DerivV2_d_11_TwoPRep_11_sh;
+// DerivGaussV2 for TwoPRep (individual integrals)
+typedef DerivGaussV2<TwoPRep_11_11_int, 0, InBra, trinvskip2_part,
+                     trinvskip2_where>
+    DerivV2_a_11_TwoPRep_11_int;
+typedef DerivGaussV2<TwoPRep_11_11_int, 0, InKet, trinvskip2_part,
+                     trinvskip2_where>
+    DerivV2_b_11_TwoPRep_11_int;
+typedef DerivGaussV2<TwoPRep_11_11_int, 1, InBra, trinvskip2_part,
+                     trinvskip2_where>
+    DerivV2_c_11_TwoPRep_11_int;
+typedef DerivGaussV2<TwoPRep_11_11_int, 1, InKet, trinvskip2_part,
+                     trinvskip2_where>
+    DerivV2_d_11_TwoPRep_11_int;
+
+// DerivGaussV2 for DummySymmIntegral (used by register_with_rrstack)
+typedef DerivGaussV2<DummySymmIntegral_11_11_sq, 0, InBra>
+    DerivV2_a_11_Dummy_11_sh;
+typedef DerivGaussV2<DummySymmIntegral_11_11_sq, 0, InKet>
+    DerivV2_b_11_Dummy_11_sh;
+typedef DerivGaussV2<DummySymmIntegral_11_11_sq, 1, InBra>
+    DerivV2_c_11_Dummy_11_sh;
+typedef DerivGaussV2<DummySymmIntegral_11_11_sq, 1, InKet>
+    DerivV2_d_11_Dummy_11_sh;
+typedef DerivGaussV2<DummySymmIntegral_11_11_int, 0, InBra>
+    DerivV2_a_11_Dummy_11_int;
+typedef DerivGaussV2<DummySymmIntegral_11_11_int, 0, InKet>
+    DerivV2_b_11_Dummy_11_int;
+typedef DerivGaussV2<DummySymmIntegral_11_11_int, 1, InBra>
+    DerivV2_c_11_Dummy_11_int;
+typedef DerivGaussV2<DummySymmIntegral_11_11_int, 1, InKet>
+    DerivV2_d_11_Dummy_11_int;
+
+typedef CR_11_Coulombσpσp_11<CGShell> CR_11_Coulombσpσp_11_sh;
+typedef CR_11_Coulombσpσp_11<CGF> CR_11_Coulombσpσp_11_int;
+
+typedef CR_11_σpσpCoulombσpσp_11<CGShell> CR_11_σpσpCoulombσpσp_11_sh;
+typedef CR_11_σpσpCoulombσpσp_11<CGF> CR_11_σpσpCoulombσpσp_11_int;
+
+typedef CR_11_σpCoulombσp_11<CGShell> CR_11_σpCoulombσp_11_sh;
+typedef CR_11_σpCoulombσp_11<CGF> CR_11_σpCoulombσp_11_int;
 };  // namespace libint2
 
 #endif  // header guard
diff --git a/src/bin/libint/oper.h b/src/bin/libint/oper.h
index 6afbff587..29b367378 100644
--- a/src/bin/libint/oper.h
+++ b/src/bin/libint/oper.h
@@ -289,23 +289,22 @@ BOOST_PP_LIST_FOR_EACH(BOOST_PP_DECLARE_HERMITIAN_ONEBODY_DESCRIPTOR,
 struct σpVσp_Descr : public Contractable<σpVσp_Descr> {
   typedef MultiplicativeODep1Body_Props Properties;
 
-  σpVσp_Descr() : quaternionic_index_(0) {}
-  σpVσp_Descr(int quaternionic_index)
-      : quaternionic_index_(quaternionic_index) {
-    assert(quaternionic_index <= 3);
+  σpVσp_Descr() : quaternion_index_(0) {}
+  σpVσp_Descr(int quaternion_index) : quaternion_index_(quaternion_index) {
+    assert(quaternion_index <= 3);
   }
 
   static const unsigned int max_key = 4;
-  unsigned int key() const { return quaternionic_index(); }
+  unsigned int key() const { return quaternion_index(); }
   std::string description() const {
     std::string descr("opVop[");
-    if (quaternionic_index() == 0)
+    if (quaternion_index() == 0)
       descr += "0";
-    else if (quaternionic_index() == 1)
+    else if (quaternion_index() == 1)
       descr += "X";
-    else if (quaternionic_index() == 2)
+    else if (quaternion_index() == 2)
       descr += "Y";
-    else if (quaternionic_index() == 3)
+    else if (quaternion_index() == 3)
       descr += "Z";
     else
       abort();
@@ -315,13 +314,52 @@ struct σpVσp_Descr : public Contractable<σpVσp_Descr> {
   int psymm(int i, int j) const { abort(); }
   int hermitian(int i) const { return +1; }
 
-  int quaternionic_index() const { return quaternionic_index_; }
+  int quaternion_index() const { return quaternion_index_; }
 
  private:
-  const int quaternionic_index_ = -1;
+  const int quaternion_index_ = -1;
 };
 typedef GenOper<σpVσp_Descr> σpVσpOper;
 
+/** opRop: (μ σ·p | r | σ·p ν), one-body σ·p-on-both-sides analog of dipole.
+ *  σ_a σ_b = δ_ab + iε_abc σ_c folds the 9 raw σ_a∂_a r_k σ_b∂_b dyadics per
+ *  dipole direction k down to 4 Pauli-quaternion components (trace + 3
+ *  antisym), mirroring σpVσp's fold of σ·p V σ·p. 12 outputs total = 3 dipole
+ *  directions × 4 Pauli components, indexed composite_index = 4·k + q.
+ */
+struct σpRσp_Descr : public Contractable<σpRσp_Descr> {
+  typedef MultiplicativeODep1Body_Props Properties;
+
+  σpRσp_Descr() : composite_index_(0) {}
+  σpRσp_Descr(int composite_index) : composite_index_(composite_index) {
+    assert(composite_index >= 0 && composite_index < 12);
+  }
+
+  static const unsigned int max_key = 12;
+  unsigned int key() const { return composite_index(); }
+  std::string description() const {
+    static const char* dipole_lbl[] = {"x", "y", "z"};
+    static const char* pauli_lbl[] = {"0", "X", "Y", "Z"};
+    const auto ci = composite_index();
+    if (ci < 0 || ci >= 12) abort();
+    return std::string("opRop[") + dipole_lbl[ci / 4] + "," +
+           pauli_lbl[ci % 4] + "]";
+  }
+  std::string label() const { return description(); }
+  int psymm(int i, int j) const { abort(); }
+  int hermitian(int i) const { return +1; }
+
+  int composite_index() const { return composite_index_; }
+  /// dipole direction ∈ {0=x, 1=y, 2=z}
+  int dipole_dir() const { return composite_index_ / 4; }
+  /// Pauli quaternion ∈ {0=trace, 1=σ_x, 2=σ_y, 3=σ_z}
+  int quaternion_index() const { return composite_index_ % 4; }
+
+ private:
+  const int composite_index_ = -1;
+};
+typedef GenOper<σpRσp_Descr> σpRσpOper;
+
 /// cartesian multipole operator in \c NDIM dimensions
 /// \f$ \hat{O}(\vec{k}) \equiv \vec{r}^{\cdot \vec{k}} = r_1^{k_1} r_2^{k_2}
 /// \cdots \f$ \internal OriginDerivative<NDIM> is used to store cartesian
@@ -400,6 +438,146 @@ struct TwoPRep_Descr : public Contractable<TwoPRep_Descr> {
 };
 typedef GenOper<TwoPRep_Descr> TwoPRep;
 
+/** Coulombσpσp is the two-body repulsion operator.
+ */
+struct Coulombσpσp_Descr : public Contractable<Coulombσpσp_Descr> {
+  typedef MultiplicativeSymm2Body_Props Properties;
+
+  Coulombσpσp_Descr() : quaternion_index_(0) {}
+  Coulombσpσp_Descr(int quaternion_index)
+      : quaternion_index_(quaternion_index) {
+    assert(quaternion_index <= 3);
+  }
+
+  static const unsigned int max_key = 4;
+  unsigned int key() const { return quaternion_index(); }
+  std::string description() const {
+    std::string descr("coulomb_opop[");
+    if (quaternion_index() == 0)
+      descr += "0";
+    else if (quaternion_index() == 1)
+      descr += "X";
+    else if (quaternion_index() == 2)
+      descr += "Y";
+    else if (quaternion_index() == 3)
+      descr += "Z";
+    else
+      abort();
+    return descr + "]";
+  }
+  std::string label() const { return description(); }
+  int psymm(int i, int j) const { abort(); }
+  int hermitian(int i) const { return +1; }
+
+  int quaternion_index() const { return quaternion_index_; }
+
+ private:
+  const int quaternion_index_ = -1;
+};
+typedef GenOper<Coulombσpσp_Descr> CoulombσpσpOper;
+
+struct σpσpCoulombσpσp_Descr : public Contractable<σpσpCoulombσpσp_Descr> {
+  typedef MultiplicativeSymm2Body_Props Properties;
+
+  σpσpCoulombσpσp_Descr() : quaternion_index_(0) {}
+  σpσpCoulombσpσp_Descr(int quaternion_index)
+      : quaternion_index_(quaternion_index) {
+    assert(quaternion_index >= 0 && quaternion_index <= 15);
+  }
+
+  /// 16 components from tensor product of two independent spin spaces:
+  /// index = 4 * bra_spin_index + ket_spin_index
+  /// where spin indices are: 0=S (scalar), 1=X, 2=Y, 3=Z (cross product)
+  static const unsigned int max_key = 16;
+  unsigned int key() const { return quaternion_index(); }
+  std::string description() const {
+    // clang-format off
+    // Option A (tensor product order): index = 4 * bra_spin + ket_spin
+    static const char* labels[] = {
+        "SS", "SX", "SY", "SZ",
+        "XS", "XX", "XY", "XZ",
+        "YS", "YX", "YY", "YZ",
+        "ZS", "ZX", "ZY", "ZZ"
+    };
+    // clang-format on
+    const auto qi = quaternion_index();
+    if (qi > 15) abort();
+    return std::string("opop_coulomb_opop[") + labels[qi] + "]";
+  }
+  std::string label() const { return description(); }
+  int psymm(int i, int j) const { abort(); }
+  int hermitian(int i) const { return +1; }
+
+  int quaternion_index() const { return quaternion_index_; }
+
+ private:
+  const int quaternion_index_ = -1;
+};
+typedef GenOper<σpσpCoulombσpσp_Descr> σpσpCoulombσpσpOper;
+
+/** σpCoulombσp: (μ σ·p ν | 1/r_{12} | κ σ·p λ).
+ *  Gaunt LS "bilinear" operator with one σ·p on each side.
+ *  Outputs the SO(3) irreducible decomposition of the 3×3 gradient-gradient
+ *  tensor T_{ab} = ∂_a ∂_b (μν|κλ): 1 scalar trace + 3 antisymmetric
+ *  (curl-curl) + 5 symmetric-traceless = 9 components. Same storage as the
+ *  raw dyadic, but indexed by physics-meaningful irreps so consumers do not
+ *  need to hand-build trace/antisym/sym-TL combinations at every contraction
+ *  site.
+ */
+struct σpCoulombσp_Descr : public Contractable<σpCoulombσp_Descr> {
+  typedef MultiplicativeSymm2Body_Props Properties;
+
+  /// SO(3) irreducible components of the rank-2 Cartesian tensor T_{ab}.
+  enum Component : int {
+    Scalar = 0,      ///< T_xx + T_yy + T_zz (trace, ∇·∇)
+    AntisymX = 1,    ///< T_yz − T_zy = (∇×∇)_x
+    AntisymY = 2,    ///< T_zx − T_xz = (∇×∇)_y
+    AntisymZ = 3,    ///< T_xy − T_yx = (∇×∇)_z
+    SymTLDiagA = 4,  ///< T_xx − T_yy
+    SymTLDiagB = 5,  ///< 2·T_zz − T_xx − T_yy
+    SymTLOffXY = 6,  ///< T_xy + T_yx
+    SymTLOffXZ = 7,  ///< T_xz + T_zx
+    SymTLOffYZ = 8,  ///< T_yz + T_zy
+  };
+
+  σpCoulombσp_Descr() : component_index_(0) {}
+  σpCoulombσp_Descr(int component_index) : component_index_(component_index) {
+    assert(component_index >= 0 && component_index <= 8);
+  }
+
+  static const unsigned int max_key = 9;
+  unsigned int key() const { return component_index(); }
+  std::string description() const {
+    static const char* labels[] = {
+        "scalar",       "antisym_x",    "antisym_y",
+        "antisym_z",    "symtl_diag_a", "symtl_diag_b",
+        "symtl_off_xy", "symtl_off_xz", "symtl_off_yz",
+    };
+    const auto ci = component_index();
+    if (ci > 8) abort();
+    return std::string("op_coulomb_op[") + labels[ci] + "]";
+  }
+  std::string label() const { return description(); }
+  int psymm(int i, int j) const { abort(); }
+  int hermitian(int i) const { return +1; }
+
+  int component_index() const { return component_index_; }
+
+  bool is_scalar() const { return component_index_ == Scalar; }
+  bool is_antisym() const {
+    return component_index_ >= AntisymX && component_index_ <= AntisymZ;
+  }
+  bool is_sym_tl() const {
+    return component_index_ >= SymTLDiagA && component_index_ <= SymTLOffYZ;
+  }
+  /// for antisym components: 0=x, 1=y, 2=z (only valid if is_antisym())
+  int antisym_cart() const { return component_index_ - AntisymX; }
+
+ private:
+  const int component_index_ = -1;
+};
+typedef GenOper<σpCoulombσp_Descr> σpCoulombσpOper;
+
 /** GTG_1d is the two-body 1-dimensional Gaussian geminal
  */
 struct GTG_1d_Descr : public Contractable<GTG_1d_Descr> {
diff --git a/src/bin/libint/rr.cc b/src/bin/libint/rr.cc
index 32bbb9df9..a653bf4b2 100644
--- a/src/bin/libint/rr.cc
+++ b/src/bin/libint/rr.cc
@@ -127,8 +127,11 @@ void RecurrenceRelation::generate_code(
   // Assign symbols for the target and source integral sets
   std::shared_ptr<CodeSymbols> symbols(new CodeSymbols);
   assign_symbols_(symbols);
+  // Compute local dimensions before optimize_rr_out so that
+  // handle_trivial_nodes uses the correct dims (e.g., "lowdim" instead of "1")
+  std::shared_ptr<ImplicitDimensions> localdims = adapt_dims_(dims);
   // Traverse the graph
-  dg->optimize_rr_out(context);
+  dg->optimize_rr_out(context, localdims);
   dg->traverse();
 #if PRINT_DAG_GRAPHVIZ
   {
@@ -138,7 +141,6 @@ void RecurrenceRelation::generate_code(
 #endif
   // Generate code
   std::shared_ptr<MemoryManager> memman(new WorstFitMemoryManager());
-  std::shared_ptr<ImplicitDimensions> localdims = adapt_dims_(dims);
   dg->generate_code(context, memman, localdims, symbols, funcname, decl, def);
 
   // extract all external symbols -- these will be members of the evaluator
diff --git a/src/bin/libint/strategy.cc b/src/bin/libint/strategy.cc
index bbfc3fe21..e01f76a7d 100644
--- a/src/bin/libint/strategy.cc
+++ b/src/bin/libint/strategy.cc
@@ -70,51 +70,80 @@ struct MasterStrategy;
 #if LIBINT_SHELLQUARTET_STRATEGY == LIBINT_SHELLQUARTET_STRATEGY_A0C0
 template <>
 struct MasterStrategy<TwoPRep_11_11_sq> {
-  typedef boost::mpl::list<HRR_ab_11_TwoPRep_11_sh, HRR_cd_11_TwoPRep_11_sh,
-                           Deriv_a_11_TwoPRep_11_sh, Deriv_b_11_TwoPRep_11_sh,
-                           Deriv_c_11_TwoPRep_11_sh, Deriv_d_11_TwoPRep_11_sh,
+  typedef boost::mpl::list<
+      HRR_ab_11_TwoPRep_11_sh, HRR_cd_11_TwoPRep_11_sh,
+      DerivV2_a_11_TwoPRep_11_sh, DerivV2_b_11_TwoPRep_11_sh,
+      DerivV2_c_11_TwoPRep_11_sh, DerivV2_d_11_TwoPRep_11_sh,
 #if LIBINT_ERI_STRATEGY == 2
-                           ITR_a_11_TwoPRep_11_sh, ITR_c_11_TwoPRep_11_sh,
+      ITR_a_11_TwoPRep_11_sh, ITR_c_11_TwoPRep_11_sh,
 #endif
-                           VRR_a_11_TwoPRep_11_sh, VRR_c_11_TwoPRep_11_sh>
+      VRR_a_11_TwoPRep_11_sh, VRR_c_11_TwoPRep_11_sh>
       value;
 };
 template <>
 struct MasterStrategy<TwoPRep_11_11_int> {
-  typedef boost::mpl::list<HRR_ab_11_TwoPRep_11_int, HRR_cd_11_TwoPRep_11_int,
-                           Deriv_a_11_TwoPRep_11_int, Deriv_b_11_TwoPRep_11_int,
-                           Deriv_c_11_TwoPRep_11_int, Deriv_d_11_TwoPRep_11_int,
+  typedef boost::mpl::list<
+      HRR_ab_11_TwoPRep_11_int, HRR_cd_11_TwoPRep_11_int,
+      DerivV2_a_11_TwoPRep_11_int, DerivV2_b_11_TwoPRep_11_int,
+      DerivV2_c_11_TwoPRep_11_int, DerivV2_d_11_TwoPRep_11_int,
 #if LIBINT_ERI_STRATEGY == 2
-                           ITR_a_11_TwoPRep_11_int, ITR_c_11_TwoPRep_11_int,
+      ITR_a_11_TwoPRep_11_int, ITR_c_11_TwoPRep_11_int,
 #endif
-                           VRR_a_11_TwoPRep_11_int, VRR_c_11_TwoPRep_11_int>
+      VRR_a_11_TwoPRep_11_int, VRR_c_11_TwoPRep_11_int>
       value;
 };
 #else  // 0B0D strategy
 template <>
 struct MasterStrategy<TwoPRep_11_11_sq> {
-  typedef boost::mpl::list<HRR_ba_11_TwoPRep_11_sh, HRR_dc_11_TwoPRep_11_sh,
-                           Deriv_a_11_TwoPRep_11_sh, Deriv_b_11_TwoPRep_11_sh,
-                           Deriv_c_11_TwoPRep_11_sh, Deriv_d_11_TwoPRep_11_sh,
+  typedef boost::mpl::list<
+      HRR_ba_11_TwoPRep_11_sh, HRR_dc_11_TwoPRep_11_sh,
+      DerivV2_a_11_TwoPRep_11_sh, DerivV2_b_11_TwoPRep_11_sh,
+      DerivV2_c_11_TwoPRep_11_sh, DerivV2_d_11_TwoPRep_11_sh,
 #if LIBINT_ERI_STRATEGY == 2
-                           ITR_b_11_TwoPRep_11_sh, ITR_d_11_TwoPRep_11_sh,
+      ITR_b_11_TwoPRep_11_sh, ITR_d_11_TwoPRep_11_sh,
 #endif
-                           VRR_b_11_TwoPRep_11_sh, VRR_d_11_TwoPRep_11_sh>
+      VRR_b_11_TwoPRep_11_sh, VRR_d_11_TwoPRep_11_sh>
       value;
 };
 template <>
 struct MasterStrategy<TwoPRep_11_11_int> {
-  typedef boost::mpl::list<HRR_ba_11_TwoPRep_11_int, HRR_dc_11_TwoPRep_11_int,
-                           Deriv_a_11_TwoPRep_11_int, Deriv_b_11_TwoPRep_11_int,
-                           Deriv_c_11_TwoPRep_11_int, Deriv_d_11_TwoPRep_11_int,
+  typedef boost::mpl::list<
+      HRR_ba_11_TwoPRep_11_int, HRR_dc_11_TwoPRep_11_int,
+      DerivV2_a_11_TwoPRep_11_int, DerivV2_b_11_TwoPRep_11_int,
+      DerivV2_c_11_TwoPRep_11_int, DerivV2_d_11_TwoPRep_11_int,
 #if LIBINT_ERI_STRATEGY == 2
-                           ITR_b_11_TwoPRep_11_int, ITR_d_11_TwoPRep_11_int,
+      ITR_b_11_TwoPRep_11_int, ITR_d_11_TwoPRep_11_int,
 #endif
-                           VRR_b_11_TwoPRep_11_int, VRR_d_11_TwoPRep_11_int>
+      VRR_b_11_TwoPRep_11_int, VRR_d_11_TwoPRep_11_int>
       value;
 };
 #endif
 
+template <>
+struct MasterStrategy<Coulombσpσp_11_11_sq> {
+  typedef boost::mpl::list<CR_11_Coulombσpσp_11_sh> value;
+};
+template <>
+struct MasterStrategy<Coulombσpσp_11_11_int> {
+  typedef boost::mpl::list<CR_11_Coulombσpσp_11_int> value;
+};
+template <>
+struct MasterStrategy<σpσpCoulombσpσp_11_11_sq> {
+  typedef boost::mpl::list<CR_11_σpσpCoulombσpσp_11_sh> value;
+};
+template <>
+struct MasterStrategy<σpσpCoulombσpσp_11_11_int> {
+  typedef boost::mpl::list<CR_11_σpσpCoulombσpσp_11_int> value;
+};
+template <>
+struct MasterStrategy<σpCoulombσp_11_11_sq> {
+  typedef boost::mpl::list<CR_11_σpCoulombσp_11_sh> value;
+};
+template <>
+struct MasterStrategy<σpCoulombσp_11_11_int> {
+  typedef boost::mpl::list<CR_11_σpCoulombσp_11_int> value;
+};
+
 #if LIBINT_SHELLQUARTET_STRATEGY == LIBINT_SHELLQUARTET_STRATEGY_A0C0
 template <>
 struct MasterStrategy<R12kG12_11_11_sq> {
@@ -197,21 +226,31 @@ struct MasterStrategy<DivG12prime_xTx_11_11_int> {
 #if LIBINT_SHELLQUARTET_STRATEGY == LIBINT_SHELLQUARTET_STRATEGY_A0C0
 template <>
 struct MasterStrategy<DummySymmIntegral_11_11_sq> {
-  typedef boost::mpl::list<HRR_ab_11_Dummy_11_sh, HRR_cd_11_Dummy_11_sh> value;
+  typedef boost::mpl::list<HRR_ab_11_Dummy_11_sh, HRR_cd_11_Dummy_11_sh,
+                           DerivV2_a_11_Dummy_11_sh, DerivV2_b_11_Dummy_11_sh,
+                           DerivV2_c_11_Dummy_11_sh, DerivV2_d_11_Dummy_11_sh>
+      value;
 };
 template <>
 struct MasterStrategy<DummySymmIntegral_11_11_int> {
-  typedef boost::mpl::list<HRR_ab_11_Dummy_11_int, HRR_cd_11_Dummy_11_int>
+  typedef boost::mpl::list<HRR_ab_11_Dummy_11_int, HRR_cd_11_Dummy_11_int,
+                           DerivV2_a_11_Dummy_11_int, DerivV2_b_11_Dummy_11_int,
+                           DerivV2_c_11_Dummy_11_int, DerivV2_d_11_Dummy_11_int>
       value;
 };
 #else  // 0B0D strategy
 template <>
 struct MasterStrategy<DummySymmIntegral_11_11_sq> {
-  typedef boost::mpl::list<HRR_ba_11_Dummy_11_sh, HRR_dc_11_Dummy_11_sh> value;
+  typedef boost::mpl::list<HRR_ba_11_Dummy_11_sh, HRR_dc_11_Dummy_11_sh,
+                           DerivV2_a_11_Dummy_11_sh, DerivV2_b_11_Dummy_11_sh,
+                           DerivV2_c_11_Dummy_11_sh, DerivV2_d_11_Dummy_11_sh>
+      value;
 };
 template <>
 struct MasterStrategy<DummySymmIntegral_11_11_int> {
-  typedef boost::mpl::list<HRR_ba_11_Dummy_11_int, HRR_dc_11_Dummy_11_int>
+  typedef boost::mpl::list<HRR_ba_11_Dummy_11_int, HRR_dc_11_Dummy_11_int,
+                           DerivV2_a_11_Dummy_11_int, DerivV2_b_11_Dummy_11_int,
+                           DerivV2_c_11_Dummy_11_int, DerivV2_d_11_Dummy_11_int>
       value;
 };
 #endif
@@ -276,6 +315,14 @@ struct MasterStrategy<σpVσp_1_1_int> {
   typedef boost::mpl::list<CR_1_σpVσp_1_int> value;
 };
 template <>
+struct MasterStrategy<σpRσp_1_1_sh> {
+  typedef boost::mpl::list<CR_1_σpRσp_1_sh> value;
+};
+template <>
+struct MasterStrategy<σpRσp_1_1_int> {
+  typedef boost::mpl::list<CR_1_σpRσp_1_int> value;
+};
+template <>
 struct MasterStrategy<Kinetic_1_1_sh> {
   typedef boost::mpl::list<CR_XYZ_1_1<CGShell, KineticOper>> value;
 };