diff --git a/.cirrus.yml b/.cirrus.yml index da32366737..1b7fee6c6e 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -58,8 +58,8 @@ task: - export VALID_ARCHS="i386 x86_64" - xcrun --sdk macosx --show-sdk-path - xcodebuild -version - - export CC=/Applications/Xcode_16.3.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang - - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_16.3.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX15.4.sdk -arch x86_64" + - export CC=/Applications/Xcode_26.0.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_26.0.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX26.0.sdk -arch x86_64" - make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" always: config_artifacts: @@ -78,8 +78,8 @@ task: - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" - - export CC=/Applications/Xcode_16.3.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang - - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_16.3.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS18.4.sdk -arch arm64 -miphoneos-version-min=10.0" + - export CC=/Applications/Xcode_26.0.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_26.0.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS26.0.sdk -arch arm64 -miphoneos-version-min=10.0" - xcrun --sdk iphoneos --show-sdk-path - ls -l /Applications - make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1 @@ -127,7 +127,7 @@ task: FreeBSD_task: name: FreeBSD-gcc freebsd_instance: - image_family: freebsd-14-2 + image_family: freebsd-14-3 install_script: - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc compile_script: @@ -138,7 +138,7 @@ FreeBSD_task: FreeBSD_task: name: freebsd-gcc-ilp64 freebsd_instance: - image_family: freebsd-14-2 + image_family: freebsd-14-3 install_script: - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc compile_script: @@ -148,7 +148,7 @@ FreeBSD_task: FreeBSD_task: name: FreeBSD-clang-openmp freebsd_instance: - image_family: freebsd-14-2 + image_family: freebsd-14-3 install_script: - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc - ln -s /usr/local/lib/gcc13/libgfortran.so.5.0.0 /usr/lib/libgfortran.so diff --git a/.github/workflows/apple_m.yml b/.github/workflows/apple_m.yml index 81952dabd7..b6f13570c9 100644 --- a/.github/workflows/apple_m.yml +++ b/.github/workflows/apple_m.yml @@ -44,7 +44,7 @@ jobs: elif [ "$RUNNER_OS" == "macOS" ]; then # It looks like "gfortran" isn't working correctly unless "gcc" is re-installed. brew reinstall gcc - brew install coreutils cmake ccache + brew install coreutils ccache brew install llvm else echo "::error::$RUNNER_OS not supported" @@ -87,10 +87,16 @@ jobs: echo "max_size = 300M" > ~/.ccache/ccache.conf echo "compression = true" >> ~/.ccache/ccache.conf ccache -s + + - name: Add gfortran runtime to link path + if: matrix.build == 'make' && runner.os == 'macOS' + run: | + GFORTRAN_LIBDIR=$(gfortran -print-file-name=libgfortran.dylib | xargs dirname) + echo "Using gfortran runtime in $GFORTRAN_LIBDIR" + echo "LDFLAGS=-L/opt/homebrew/opt/llvm/lib -L$GFORTRAN_LIBDIR" >> $GITHUB_ENV - name: Build OpenBLAS run: | - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" export CC="/opt/homebrew/opt/llvm/bin/clang" case "${{ matrix.build }}" in diff --git a/.github/workflows/arm64_graviton.yml b/.github/workflows/arm64_graviton.yml index 6928312b56..4b4e151672 100644 --- a/.github/workflows/arm64_graviton.yml +++ b/.github/workflows/arm64_graviton.yml @@ -88,13 +88,14 @@ jobs: run: | case "${{ matrix.build }}" in "make") - make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC="ccache ${{ matrix.fortran }}" + make -j$(nproc) DYNAMIC_ARCH=1 BUILD_BFLOAT16=1 USE_OPENMP=0 FC="ccache ${{ matrix.fortran }}" ;; "cmake") mkdir build && cd build cmake -DDYNAMIC_ARCH=1 \ -DNOFORTRAN=0 \ -DBUILD_WITHOUT_LAPACK=0 \ + -DBUILD_BFLOAT16=1 \ -DCMAKE_VERBOSE_MAKEFILE=ON \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \ diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index 2d5c7b6123..3699200465 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -1,6 +1,6 @@ name: continuous build -on: [push, pull_request] +on: [push, pull_request, workflow_dispatch] concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -11,17 +11,24 @@ permissions: jobs: build: - if: "github.repository == 'OpenMathLib/OpenBLAS'" + if: "github.repository == 'OpenMathLib/OpenBLAS' || github.event_name == 'workflow_dispatch'" runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: - os: [ubuntu-latest, macos-latest] + os: [ubuntu-latest, macos-latest, ubuntu-24.04-arm] + cc: [gcc, clang, clang-21] fortran: [gfortran, flang] build: [cmake, make] exclude: - os: macos-latest + cc: gcc + - os: macos-latest + cc: clang-21 + - os: macos-latest + fortran: flang + - os: ubuntu-24.04-arm fortran: flang steps: @@ -42,14 +49,27 @@ jobs: - name: Install Dependencies run: | if [ "$RUNNER_OS" == "Linux" ]; then + cat << EOF | sudo tee -a /etc/apt/apt.conf.d/01norecommend + APT::Install-Recommends "0"; + APT::Install-Suggests "0"; + EOF sudo apt-get update - sudo apt-get install -y gfortran cmake ccache - wget http://security.ubuntu.com/ubuntu/pool/universe/n/ncurses/libtinfo5_6.3-2ubuntu0.1_amd64.deb - sudo apt install ./libtinfo5_6.3-2ubuntu0.1_amd64.deb + sudo apt-get install -y ccache + if [ "${{ matrix.cc }}" == "clang-21" ]; then + wget https://apt.llvm.org/llvm.sh + chmod +x llvm.sh + sudo ./llvm.sh 21 + fi + if [ "${{ matrix.fortran }}" == "flang" ]; then + wget http://security.ubuntu.com/ubuntu/pool/universe/n/ncurses/libtinfo5_6.3-2ubuntu0.1_amd64.deb + sudo apt install ./libtinfo5_6.3-2ubuntu0.1_amd64.deb + else + sudo apt-get install -y ${{ matrix.fortran }} + fi elif [ "$RUNNER_OS" == "macOS" ]; then # It looks like "gfortran" isn't working correctly unless "gcc" is re-installed. brew reinstall gcc - brew install coreutils cmake ccache + brew install coreutils ccache else echo "::error::$RUNNER_OS not supported" exit 1 @@ -64,12 +84,12 @@ jobs: # GNU make and cmake call the compilers differently. It looks like # that causes the cache to mismatch. Keep the ccache for both build # tools separate to avoid polluting each other. - key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }} + key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build }}-${{ matrix.cc }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }} # Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler. restore-keys: | - ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }} - ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }} - ccache-${{ runner.os }}-${{ matrix.build }} + ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build }}-${{ matrix.cc }}-${{ matrix.fortran }}-${{ github.ref }} + ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build }}-${{ matrix.cc }}-${{ matrix.fortran }} + ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build }}-${{ matrix.cc }} - name: Configure ccache run: | @@ -90,6 +110,14 @@ jobs: echo "compression = true" >> ~/.ccache/ccache.conf ccache -s + - name: Add gfortran runtime to link path + if: matrix.build == 'make' && runner.os == 'macOS' + run: | + GFORTRAN_LIBDIR=$(gfortran -print-file-name=libgfortran.dylib | xargs dirname) + echo "Using gfortran runtime in $GFORTRAN_LIBDIR" + # Preserve whatever LDFLAGS may already contain + echo "LDFLAGS=${LDFLAGS:+$LDFLAGS }-L$GFORTRAN_LIBDIR" >> "$GITHUB_ENV" + - name: Build OpenBLAS run: | if [ "${{ matrix.fortran }}" = "flang" ]; then @@ -102,7 +130,7 @@ jobs: fi case "${{ matrix.build }}" in "make") - make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC="ccache ${{ matrix.fortran }}" + make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 CC="ccache ${{ matrix.cc }}" FC="ccache ${{ matrix.fortran }}" ;; "cmake") mkdir build && cd build @@ -111,6 +139,7 @@ jobs: -DBUILD_WITHOUT_LAPACK=0 \ -DCMAKE_VERBOSE_MAKEFILE=ON \ -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_C_COMPILER=${{ matrix.cc }} \ -DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \ -DCMAKE_C_COMPILER_LAUNCHER=ccache \ -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ @@ -134,13 +163,13 @@ jobs: "make") MAKE_FLAGS='DYNAMIC_ARCH=1 USE_OPENMP=0' echo "::group::Tests in 'test' directory" - make -C test $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" + make -C test $MAKE_FLAGS CC="ccache ${{ matrix.cc }}" FC="ccache ${{ matrix.fortran }}" echo "::endgroup::" echo "::group::Tests in 'ctest' directory" - make -C ctest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" + make -C ctest $MAKE_FLAGS CC="ccache ${{ matrix.cc }}" FC="ccache ${{ matrix.fortran }}" echo "::endgroup::" echo "::group::Tests in 'utest' directory" - make -C utest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" + make -C utest $MAKE_FLAGS CC="ccache ${{ matrix.cc }}" FC="ccache ${{ matrix.fortran }}" echo "::endgroup::" ;; "cmake") @@ -364,15 +393,15 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v3 - + - name: Install Dependencies run: | sudo apt-get update sudo apt-get install -y gcc gfortran make - + - name: Build OpenBLAS run: | - make -j${nproc} + make -j${nproc} make -j${nproc} lapack-test - - + + diff --git a/.github/workflows/loongarch64_clang.yml b/.github/workflows/loongarch64_clang.yml index fdb48309b9..a42a2cd651 100644 --- a/.github/workflows/loongarch64_clang.yml +++ b/.github/workflows/loongarch64_clang.yml @@ -35,7 +35,7 @@ jobs: - name: Install libffi6 run: | - wget http://ftp.ca.debian.org/debian/pool/main/libf/libffi/libffi6_3.2.1-9_amd64.deb + wget https://download.nvidia.com/cumulus/apt.cumulusnetworks.com/pool/upstream/libf/libffi/libffi6_3.2.1-9_amd64.deb sudo dpkg -i libffi6_3.2.1-9_amd64.deb - name: Install APT deps diff --git a/.github/workflows/riscv64_vector.yml b/.github/workflows/riscv64_vector.yml index 9209ebb7d1..474e51bb48 100644 --- a/.github/workflows/riscv64_vector.yml +++ b/.github/workflows/riscv64_vector.yml @@ -16,8 +16,8 @@ jobs: env: triple: riscv64-unknown-linux-gnu riscv_gnu_toolchain: https://github.com/riscv-collab/riscv-gnu-toolchain - riscv_gnu_toolchain_version: 13.2.0 - riscv_gnu_toolchain_nightly_download_path: /releases/download/2024.02.02/riscv64-glibc-ubuntu-22.04-llvm-nightly-2024.02.02-nightly.tar.gz + riscv_gnu_toolchain_version: 15.1.0 + riscv_gnu_toolchain_nightly_download_path: /releases/download/2025.08.29/riscv64-glibc-ubuntu-22.04-llvm-nightly-2025.08.29-nightly.tar.xz strategy: fail-fast: false matrix: @@ -26,8 +26,8 @@ jobs: opts: TARGET=RISCV64_ZVL128B BINARY=64 ARCH=riscv64 qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=128,elen=64 - target: RISCV64_ZVL256B - opts: TARGET=RISCV64_ZVL256B BINARY=64 ARCH=riscv64 - qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64 + opts: TARGET=RISCV64_ZVL256B BINARY=64 ARCH=riscv64 BUILD_BFLOAT16=1 BUILD_HFLOAT16=1 + qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64,zfh=true,zvfh=true,zvfbfwma=true - target: DYNAMIC_ARCH=1 opts: TARGET=RISCV64_GENERIC BINARY=64 ARCH=riscv64 DYNAMIC_ARCH=1 qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64 @@ -40,10 +40,13 @@ jobs: run: | sudo apt-get update sudo apt-get install autoconf automake autotools-dev ninja-build make \ - libgomp1-riscv64-cross ccache + libgomp1-riscv64-cross ccache qemu-kvm qemu-user libc6-riscv64-cross wget ${riscv_gnu_toolchain}/${riscv_gnu_toolchain_nightly_download_path} tar -xvf $(basename ${riscv_gnu_toolchain_nightly_download_path}) -C /opt - + wget https://gist.github.com/martin-frbg/bb630e0de34978e578eeb496b1538d4e/raw/7fd8d971f327f7a517b8f5f7989479ff2b36f71f/qemu-riscv64-10.1-ubuntu24 -P /opt/riscv/bin -o riscv64-qemu + mv /opt/riscv/bin/qemu-riscv64-10.1-ubuntu24 /opt/riscv/bin/qemu-riscv64 + chmod +x /opt/riscv/bin/qemu-riscv64 + - name: Compilation cache uses: actions/cache@v3 with: @@ -74,7 +77,7 @@ jobs: run: | export PATH="/opt/riscv/bin:$PATH" make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \ - CC='${triple}-gcc' \ + CC='ccache clang --rtlib=compiler-rt -target ${triple} --sysroot /opt/riscv/sysroot --gcc-toolchain=/opt/riscv/lib/gcc/riscv64-unknown-linux-gnu/${riscv_gnu_toolchain_version}/' \ AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \ RANLIB='ccache ${triple}-ranlib' \ FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \ @@ -98,6 +101,8 @@ jobs: shell: bash run: | export PATH="/opt/riscv/bin:$PATH" + export LD_LIBRARY_PATH=/opt/riscv/sysroot/lib + sudo ln -s /opt/riscv/sysroot/lib/ld-linux-riscv64-lp64d.so.1 /lib export QEMU_CPU=${{ matrix.qemu_cpu }} rm -rf ./test_out mkdir -p ./test_out @@ -134,6 +139,14 @@ jobs: wait while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ; done < <(grep -lZ FAIL ./test_out/*) if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi + if [ "${{matrix.target}}" == "RISCV64_ZVL256B" ]; then + qemu-riscv64 test/test_sbgemm & + qemu-riscv64 test/test_sbgemv & + qemu-riscv64 test/test_shgemm & + qemu-riscv64 test/test_shgemv & + qemu-riscv64 test/test_bgemm + fi + - name: netlib tests shell: bash diff --git a/.github/workflows/windows_arm64.yml b/.github/workflows/windows_arm64.yml new file mode 100644 index 0000000000..21043ec251 --- /dev/null +++ b/.github/workflows/windows_arm64.yml @@ -0,0 +1,84 @@ +name: Windows ARM64 CI + +on: + push: + branches: + - develop + pull_request: + branches: + - develop + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +permissions: + contents: read # to fetch code (actions/checkout) + +jobs: + build: + if: "github.repository == 'OpenMathLib/OpenBLAS'" + runs-on: windows-11-arm + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Install LLVM for Win-ARM64 + shell: pwsh + run: | + Invoke-WebRequest https://github.com/llvm/llvm-project/releases/download/llvmorg-20.1.8/LLVM-20.1.8-woa64.exe -UseBasicParsing -OutFile LLVM-woa64.exe + Start-Process -FilePath ".\LLVM-woa64.exe" -ArgumentList "/S" -Wait + echo "C:\Program Files\LLVM\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + #dir "C:\Program Files\LLVM\include\flang" + #rmdir /Q /S "C:/Program Files/Microsoft Visual Studio/2022/Enterprise/VC/Tools/Llvm/ARM64" + + - name: Install CMake and Ninja for Win-ARM64 + shell: pwsh + run: | + Invoke-WebRequest https://github.com/Kitware/CMake/releases/download/v3.29.4/cmake-3.29.4-windows-arm64.msi -OutFile cmake-arm64.msi + Start-Process msiexec.exe -ArgumentList "/i cmake-arm64.msi /quiet /norestart" -Wait + echo "C:\Program Files\CMake\bin" >> $env:GITHUB_PATH + + Invoke-WebRequest https://github.com/ninja-build/ninja/releases/download/v1.13.1/ninja-winarm64.zip -OutFile ninja-winarm64.zip + Expand-Archive ninja-winarm64.zip -DestinationPath ninja + Copy-Item ninja\ninja.exe -Destination "C:\Windows\System32" + + - name: Configure OpenBLAS + shell: cmd + run: | + CALL "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsarm64.bat" + set PATH=C:\Program Files\LLVM\bin;%PATH% + + mkdir build + cd build + cmake .. -G Ninja ^ + -DCMAKE_BUILD_TYPE=Release ^ + -DTARGET=ARMV8 ^ + -DBINARY=64 ^ + -DCMAKE_C_COMPILER=clang-cl ^ + -DCMAKE_Fortran_COMPILER=flang-new ^ + -DBUILD_SHARED_LIBS=ON ^ + -DCMAKE_SYSTEM_PROCESSOR=arm64 ^ + -DCMAKE_SYSTEM_NAME=Windows ^ + -DCMAKE_INSTALL_PREFIX=C:/opt + + - name: Build OpenBLAS + shell: cmd + run: | + cd build + ninja -j16 + + - name: Install OpenBLAS + shell: cmd + run: | + cd build + cmake --install . + + - name: Run ctests + shell: pwsh + run: | + $env:PATH = "C:\opt\bin;$env:PATH" + cd build + ctest + + diff --git a/.gitignore b/.gitignore index 8294da4d44..a6ee824681 100644 --- a/.gitignore +++ b/.gitignore @@ -13,8 +13,8 @@ lapack-3.4.1.tgz lapack-3.4.2 lapack-3.4.2.tgz lapack-netlib/make.inc -lapack-netlib/lapacke/include/lapacke_mangling.h lapack-netlib/SRC/la_constants.mod +lapack-netlib/SRC/la_xisnan.mod lapack-netlib/TESTING/testing_results.txt lapack-netlib/INSTALL/test* lapack-netlib/TESTING/xeigtstc @@ -80,8 +80,12 @@ test/SBLAT3_3M.SUMM test/ZBLAT2.SUMM test/ZBLAT3.SUMM test/ZBLAT3_3M.SUMM +test/SHBLAT2.SUMM test/SHBLAT3.SUMM +test/SBBLAT2.SUMM test/SBBLAT3.SUMM +test/BBLAT2.SUMM +test/BBLAT3.SUMM test/cblat1 test/cblat2 test/cblat3 @@ -95,7 +99,11 @@ test/sblat2 test/sblat3 test/sblat3_3m test/test_shgemm +test/test_shgemv test/test_sbgemm +test/test_sbgemv +test/test_bgemm +test/test_bgemv test/zblat1 test/zblat2 test/zblat3 diff --git a/CMakeLists.txt b/CMakeLists.txt index 3f9af642da..1fa4d96780 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -152,6 +152,9 @@ endif () if (NOT DEFINED BUILD_BFLOAT16) set (BUILD_BFLOAT16 false) endif () +if (NOT DEFINED BUILD_HFLOAT16) + set (BUILD_HFLOAT16 false) +endif () # set which float types we want to build for if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16) # if none are defined, build for all @@ -305,8 +308,8 @@ if (USE_OPENMP) endif() endif() -# Fix "Argument list too long" for macOS with Intel CPUs and DYNAMIC_ARCH turned on -if(APPLE AND DYNAMIC_ARCH AND (NOT CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "arm64")) +# Fix "Argument list too long" for macOS - mostly seen with older OS versions on POWERPC or Intel CPUs +if(APPLE) # Use response files set(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) # Always build static library first @@ -495,6 +498,11 @@ if (BUILD_SHARED_LIBS OR DELETE_STATIC_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFI else () set (BBF16 0) endif() + if (${BUILD_HFLOAT16}) + set (BHF16 1) + else () + set (BHF16 0) + endif() if (${BUILD_SINGLE}) set (BS 1) else () @@ -530,7 +538,7 @@ endif() #if (USE_PERL) message(STATUS "adding postbuild instruction to rename syms") add_custom_command(TARGET ${OpenBLAS_LIBNAME}_static POST_BUILD - COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "win2k" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/renamesyms.def + COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "win2k" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" "${BLD}" "${BBF16}" "${BHF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/renamesyms.def COMMAND ${CMAKE_C_COMPILER} ${CMAKE_C_FLAGS} -I${PROJECT_SOURCE_DIR} -I${PROJECT_BINARY_DIR} -c -o ${PROJECT_BINARY_DIR}/dllinit.o ${PROJECT_SOURCE_DIR}/exports/dllinit.c COMMAND lld-link -nodefaultlib:libcmt -defaultlib:${CRTLIB} ${CMAKE_LINKER_FLAGS} -errorlimit:0 -def:${PROJECT_BINARY_DIR}/renamesyms.def ${PROJECT_BINARY_DIR}/dllinit.o $ -wholearchive:$ -dll -out:$/${OpenBLAS_LIBNAME}.dll -implib:$/${OpenBLAS_LIBNAME}.dll.a ${PDBOPT} #COMMAND lld-link -nodefaultlib:libcmt -defaultlib:msvcrt ${CMAKE_LINKER_FLAGS} -errorlimit:0 -def:${PROJECT_BINARY_DIR}/renamesyms.def ${PROJECT_BINARY_DIR}/dllinit.o $ -wholearchive:$ -dll -out:$/${OpenBLAS_LIBNAME}.dll -implib:$/${OpenBLAS_LIBNAME}.dll.a @@ -540,14 +548,14 @@ message(STATUS "adding postbuild instruction to rename syms") else () if (NOT USE_PERL) add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD - COMMAND sh ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def - COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/${OpenBLAS_LIBNAME}.so + COMMAND sh ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BHF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def + COMMAND objcopy --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/${OpenBLAS_LIBNAME}.so COMMENT "renaming symbols" ) else() add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD - COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def - COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so + COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BHF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def + COMMAND objcopy --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so COMMENT "renaming symbols" ) endif() @@ -700,6 +708,39 @@ if(NOT NO_LAPACKE) COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h" ) install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + if (NOT (x${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "x")) + message (STATUS "Generating lapacke.h in ${CMAKE_INSTALL_INCLUDEDIR}") + set(LAPACKE_H ${CMAKE_BINARY_DIR}/generated/lapacke.h) + file(READ ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke.h LAPACKE_H_CONTENTS) + if (NOT ${SYMBOLPREFIX} STREQUAL "") + string(REGEX REPLACE "(LAPACKE_*)" " ${SYMBOLPREFIX}\\1" LAPACKE_H_CONTENTS_NEW "${LAPACKE_H_CONTENTS}") + string(REPLACE "_ ${SYMBOLPREFIX}LAPACKE_H_" "_LAPACKE_H_" LAPACKE_H_CONTENTS ${LAPACKE_H_CONTENTS_NEW}) + string(REPLACE "${SYMBOLPREFIX}LAPACKE_malloc" "LAPACKE_malloc" LAPACKE_H_CONTENTS_NEW ${LAPACKE_H_CONTENTS}) + string(REPLACE "${SYMBOLPREFIX}LAPACKE_free" "LAPACKE_free" LAPACKE_H_CONTENTS ${LAPACKE_H_CONTENTS_NEW}) + set(LAPACKE_H_CONTENTS_NEW ${LAPACKE_H_CONTENTS}) + endif() + if (NOT ${SYMBOLSUFFIX} STREQUAL "") + string(REGEX REPLACE "(${SYMBOLPREFIX}LAPACKE_[a-z1-9]*[^ (]*)" "\\1${SYMBOLSUFFIX}" LAPACKE_H_CONTENTS_NEW "${LAPACKE_H_CONTENTS}") + string(REPLACE "#define${SYMBOLSUFFIX}" "#define" LAPACKE_H_CONTENTS ${LAPACKE_H_CONTENTS_NEW}) + string(REPLACE "LAPACKE_malloc${SYMBOLSUFFIX}" "LAPACKE_malloc" LAPACKE_H_CONTENTS_NEW ${LAPACKE_H_CONTENTS}) + string(REPLACE "LAPACKE_free${SYMBOLSUFFIX}" "LAPACKE_free" LAPACKE_H_CONTENTS ${LAPACKE_H_CONTENTS_NEW}) + set(LAPACKE_H_CONTENTS_NEW ${LAPACKE_H_CONTENTS}) + endif() + file(WRITE ${LAPACKE_H} "${LAPACKE_H_CONTENTS_NEW}") + install (FILES ${LAPACKE_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + message (STATUS "Generating lapack.h in ${CMAKE_INSTALL_INCLUDEDIR}") + set(LAPACK_H ${CMAKE_BINARY_DIR}/generated/lapack.h) + file(READ ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapack.h LAPACK_H_CONTENTS) + if (NOT ${SYMBOLPREFIX} STREQUAL "") + string(REGEX REPLACE "(LAPACK_[a-z1-9]*[ \(][.\)]*)" "${SYMBOLPREFIX}\\1" LAPACK_H_CONTENTS_NEW "${LAPACK_H_CONTENTS}") + set(LAPACK_H_CONTENTS ${LAPACK_H_CONTENTS_NEW}) + endif() + if (NOT ${SYMBOLSUFFIX} STREQUAL "") + string(REGEX REPLACE "(${SYMBOLPREFIX}LAPACK_[a-z1-9]*)([ \(].\)" "\\1${SYMBOLSUFFIX}\\2" LAPACK_H_CONTENTS_NEW "${LAPACK_H_CONTENTS}") + endif() + file(WRITE ${LAPACK_H} "${LAPACK_H_CONTENTS_NEW}") + install (FILES ${LAPACK_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) +endif() endif() # Install pkg-config files diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index d8f57ef60a..3dc3d73bbb 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -29,6 +29,9 @@ * Annop Wongwathanarat * Optimizations and other improvements targeting AArch64 +* Anna Mayne + * Optimizations and other improvements targeting AArch64 + ## Previous Developers * Zaheer Chothia @@ -251,6 +254,21 @@ In chronological order: * Ye Tao * [2025-02-03] Optimize SBGEMM kernel on NEOVERSEV1 * [2025-02-27] Add sbgemv_n_neon kernel + * [2025-05-17] Impl prototype of BGEMM inferface * Abhishek Kumar - * [2025-04-22] Optimise dot kernel for NEOVERSE V1 \ No newline at end of file + * [2025-04-22] Optimise dot kernel for NEOVERSE V1 + * [2025-07-23] ARM64-Enable bfloat16 kernels by default + +* Sharif Inamdar + * [2025-06-05] Optimize gemv_n_sve_v1x3 kernel + +* Guoyuan Li + * [2025-04-11] Optimise gemv kernel for RISCV64_ZVL256B + * [2025-05-01] Optimise zgemv kernel for RISCV64_ZVL256B + * [2025-05-17] Optimise omatcopy/zomatcopy kernel for RISCV64_ZVL256B + * [2025-05-29] Optimise axpby kernel for RISCV64_ZVL256B + * [2025-06-05] Optimise hbmv kernel for RISCV64_ZVL256B + +* Anna Mayne + * [2025-11-19] Update thread throttling profile for SGEMV on NEOVERSEV1 and NEOVERSEV2 diff --git a/Changelog.txt b/Changelog.txt index e4ba72986e..bc4f23535c 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,120 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.31 +15-Jan-2025 + +general: + - reverted a matrix partitioning optimization from 0.3.30 that could lead to + race conditions and subsequent invalid results in GEMM + - added the bfloat16 extensions BGEMM and BGEMV + - added a BLAS interface for the ?GEMM_BATCH extensions + - added the BLAS extensions ?GEMM_BATCH_STRIDED and their CBLAS interface + - added the basic infrastructure for half-precision float (FP16) format + using SH prefix + - reimplemented the LAPACK SLAED3/DLAED3 function using multithreading, thereby + improving the performance of the SSYEVD/DSYEVD eigensolver for symmetric matrices + on all platforms + - limited the number of retries for initial memory allocation to avoid infinite + hanging on low-memory systems + - fixed a thread lockup situation encountered with python 3.9 or older and numpy + - introduced a problem size threshold for multithreading in STRMV/DTRMV + - introduced a problem size threshold for multithreading in CHER/CHER2/CHPR/CHPR2 + and ZHER/ZHER2/ZHPR/ZHPR2 + - improved the problem size thresholds for multithreading in SGER/DGER + - improved autodetection of the Fortran compiler + - fixed passing of the INTERFACE64=1 option to the flang-new compiler + - fixed a potential deadlock in multithreaded code after calling fork() + - fixed builds using CMake on FreeBSD + - fixed builds using CMake from within Cygwin on Windows + - fixed builds using CMake and the NVHPC compiler on ARM64 + - fixed CMake build error from misdetecting compiler or OpenMP versions + - improved contents of the CMake-generated OpenBLASConfig.cmake file + - added support for cross-compilation to RISCV targets via CMake + - fixed cross-compilation to x86 targets from non-x86 architectures + - fixed failure to install cblas.h if NO_CBLAS=0 was specified + - fixed missing user-defined pre- and postfixes on functions in lapack.h,lapacke.h + - included fixes from the Reference-LAPACK project: + - fix ordering bug in ?LAED/?LASD (Reference-LAPACK PR 1140) + - revert changes in ?GEEV from PR 1129 (Reference-LAPACK PR 1142) + - fix workspace allocation in LAPACKE_?TRSEN (Reference-LAPACK PR 1144) + +riscv: + - added optimized SBGEMM kernels for ZVL128B and ZVL256B targets + - added optimized SHGEMM kernels for ZVL128B and ZVL256B targets + - added optimized SBGEMV and SHGEMV kernels for ZVL128B/ZVL256B + - improved performance of the GEMV kernel for ZVL256B + - improved the performance of the CROT and ZROT kernels for ZVL128B and x280 + - improved the detection of RVV1.0 capability + - improved performance of the matrix packing helper functions for ZVL128B and ZVL256B + - improved performance of OMATCOPY for ZVL128B and ZVL256B + +arm: + - fixed spurious executable stack in the getarch utility + +arm64: + - fixed spurious executable stack in the getarch utility + - fixed compiler warnings arising from the timer macro RPCC + - fixed cache size detection for Qualcomm Oryon under Windows on Arm + - fixed argument handling in the default SVE kernel for SDOT/DDOT + - building the BFLOAT16 kernels is now enabled by default + - improved the overall performance of GEMM,SYMM and HEMM on A64FX + - improved the performance of SDOT/DDOT on A64FX + - improved the multithreading performance of SDOT/DDOT on A64FX by + introduction of a throttling table matching thread count to problem size + - improved the performance of SGER/DGER on A64FX and NEOVERSEV1 + - improved the multithreading performance of GEMM on A64FX and NEOVERSEV1 + - improved the performance of the GEMV kernel for SVE-capable targets + - improved the multithreading performance of SGEMM on NEOVERSEV1 and V2 + - added optimized SAXPY/DAXPY SVE kernels for A64FX and NEOVERSEV1 + - added optimized BGEMM and BGEMV kernels for NEOVERSEV1 + - added an optimized BGEMM kernel for NEOVERSEN2 + - added support for the NEOVERSEV2 cpu + - added dedicated support for the Apple M4 cpu as VORTEXM4 + - added optimized SGEMM/SSYMM/STRMM/SSYRK/SSYR2K for SME-capable targets + (ARMV9SME and VORTEXM4) + - improved the precision of the SNRM2 kernel + - added cpu autodetection and compiler settings for Ampere One processors + - fixed cpu autodetection for Apple M systems running Linux + - fixed building on MacOS with AppleClang,gfortran and xcode v16 or newer + - fixed several errors in the C code replacements for the complex and double + precision complex LAPACK functions that get used (only) when compiling with + Microsoft C and NOFORTRAN=1 under MS Windows + +power: + - added initial support for the POWER11 architecture + - improved performance of DGEMM and DGEMV on POWER10 + - fixed the default compiler flags to use "-O3" instead of the possibly unsafe + "-Ofast" + - fixed building under MacOS (for old G4 Macs) with CMake + - fixed potential miscompilation of DGEMV and other assembly kernels by gcc15.1 + - fixed compilation with recent versions of flang + +loongarch64: + - fixed warnings and potential inaccuracies arising from incorrect saving of registers + - fixed enumeration of logical cores on big NUMA servers + - fixed building with LLVM and the INTERFACE64=1 option + +x86: + - fixed building the GEMM3M kernels for the GENERIC target + - fixed several errors in the C code replacements for the complex and double + precision complex LAPACK functions that get used (only) when compiling with + Microsoft C and NOFORTRAN=1 under MS Windows + +x86_64: + - added cpu autodetection for Intel Lunar Lake (Core Ultra 200V) + - changed all ?MIN and ?MAX assembly kernels to use unaligned operations + - fixed several errors in the C code replacements for the complex and double + precision complex LAPACK functions that get used (only) when compiling with + Microsoft C and NOFORTRAN=1 under MS Windows + - fixed potential crashes in builds for Cooper Lake, Sapphire Rapids or Zen5 cpus + under MS Windows + +zarch: + - added support for building with CMake + +sparc: + - fixed a potential crash in the DNRM2 kernel + ==================================================================== Version 0.3.30 19-Jun-2025 diff --git a/Jenkinsfile b/Jenkinsfile index baeeee59ff..c4d2a4147f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -10,5 +10,10 @@ pipeline { sh 'make clean && make' } } + stage('CMakeBuild') { + steps { + sh 'sudo apt update && sudo apt install cmake -y && make clean && rm -rf build && mkdir build && cd build && cmake -DDYNAMIC_ARCH=1 .. && make' + } + } } } diff --git a/Jenkinsfile.pwr b/Jenkinsfile.pwr index b2f8ce2e5d..f7917aaee2 100644 --- a/Jenkinsfile.pwr +++ b/Jenkinsfile.pwr @@ -1,16 +1,31 @@ pipeline { - agent { - docker { - image 'osuosl/ubuntu-ppc64le:18.04' - } - } + agent none stages { - stage('Build') { + stage('GCC build') { + agent { + docker { + image 'osuosl/ubuntu-ppc64le:18.04' // gcc 7, gfortran 7 + } + } steps { + checkout scm sh 'sudo apt update' sh 'sudo apt install gfortran -y' sh 'make clean && make' } } + stage('Clang build') { + agent { + docker { + image 'osuosl/ubuntu-ppc64le:20.04' // clang 10, gfortran 9 + } + } + steps { + checkout scm + sh 'sudo apt update' + sh 'sudo apt install -y clang gfortran' + sh 'make clean && make CC=clang' + } + } } } diff --git a/Makefile.arm64 b/Makefile.arm64 index 5d75eef30b..bcdda04778 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -1,3 +1,31 @@ +############################################################################### +# Copyright (c) 2025, The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +############################################################################### + ifneq ($(C_COMPILER), PGI) ifeq ($(C_COMPILER), CLANG) @@ -33,6 +61,11 @@ endif ifeq ($(CORE), ARMV9SME) CCOMMON_OPT += -march=armv9-a+sve2+sme FCOMMON_OPT += -march=armv9-a+sve2 +ifdef OS_WINDOWS +ifeq ($(C_COMPILER), CLANG) +CCOMMON_OPT += --aarch64-stack-hazard-size=0 +endif +endif endif ifeq ($(CORE), CORTEXA53) @@ -183,10 +216,25 @@ endif # Detect ARM Neoverse V2. ifeq ($(CORE), NEOVERSEV2) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ13) $(ISCLANG))) +CCOMMON_OPT += -mcpu=neoverse-v2 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -mcpu=neoverse-v2 +endif +else +CCOMMON_OPT += -march=armv8.2-a+sve+bf16 -mtune=neoverse-n1 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 +endif +endif +endif + +# Detect Ampere AmpereOne(ampere1,ampere1a) processors. +ifeq ($(CORE), AMPERE1) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ12) $(ISCLANG))) -CCOMMON_OPT += -march=armv9-a -mtune=neoverse-v2 +CCOMMON_OPT += -march=armv8.6-a+crypto+crc+fp16+sha3+rng ifneq ($(F_COMPILER), NAG) -FCOMMON_OPT += -march=armv9-a -mtune=neoverse-v2 +FCOMMON_OPT += -march=armv8.6-a+crypto+crc+fp16+sha3+rng endif endif endif @@ -260,6 +308,20 @@ FCOMMON_OPT += -march=armv8.3-a endif endif +ifeq ($(CORE), VORTEXM4) +ifneq ($(C_COMPILER), GCC) +ifeq ($(APPLECLANG),1) +CCOMMON_OPT += -march=armv8.4-a+sme +else +CCOMMON_OPT += -march=armv8.4-a+sme +override LDFLAGS += -lclang_rt_builtins-aarch64 +endif +else +CCOMMON_OPT += -march=armv8.4-a +endif +FCOMMON_OPT += -march=armv8.4-a +endif + ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG))) ifeq ($(CORE), TSV110) CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 diff --git a/Makefile.install b/Makefile.install index 10e6425cce..5fc6b9560c 100644 --- a/Makefile.install +++ b/Makefile.install @@ -71,7 +71,7 @@ install : lib.grd @cat common_interface.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" @echo \#endif >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" -ifndef NO_CBLAS +ifneq ($(NO_CBLAS),1) @echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @cp cblas.h cblas.tmp ifdef SYMBOLPREFIX @@ -93,9 +93,27 @@ endif ifneq ($(OSNAME), AIX) ifneq ($(NO_LAPACKE), 1) + @cp $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h lapacke_h.tmp +ifdef SYMBOLPREFIX + @sed 's/LAPACKE_[a-z1-9].[^() ]*/$(SYMBOLPREFIX)&/g' lapacke_h.tmp > lapacke.tmp2 + @mv lapacke.tmp2 lapacke_h.tmp +endif +ifdef SYMBOLSUFFIX + @sed 's/LAPACKE_[a-z1-9].[^() ]*/&$(SYMBOLSUFFIX)/g' lapacke_h.tmp > lapacke.tmp2 + @mv lapacke.tmp2 lapacke_h.tmp +endif + @-install -m644 lapacke_h.tmp "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) - @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h" - @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" + @cp $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h lapack_h.tmp +ifdef SYMBOLPREFIX + @sed 's/LAPACK_[a-z1-9]*(\.\.\.)/$(SYMBOLPREFIX)&/g' lapack_h.tmp > lapack.tmp2 + @mv lapack.tmp2 lapack_h.tmp +endif +ifdef SYMBOLSUFFIX + @sed 's/\(#define $(SYMBOLPREFIX)LAPACK_[a-z1-9].*\)\((...)\)/\1$(SYMBOLSUFFIX)\2/g' lapack_h.tmp > lapack.tmp2 + @mv lapack.tmp2 lapack_h.tmp +endif + @-install -m644 lapack_h.tmp "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h" @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" @@ -189,6 +207,8 @@ endif #Generating OpenBLASConfig.cmake +ifneq ($(origin _OpenBLAS_ROOT_DIR),"undefined") + @echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" @echo "file(REAL_PATH \"../../..\" _OpenBLAS_ROOT_DIR BASE_DIRECTORY \$${CMAKE_CURRENT_LIST_DIR} )" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" @@ -215,6 +235,27 @@ else #only static @echo "SET(OpenBLAS_LIBRARIES \$${_OpenBLAS_ROOT_DIR}/lib/$(LIBPREFIX).$(LIBSUFFIX))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif +else + echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) + @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" + @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" +ifneq ($(NO_SHARED),1) +#ifeq logical or +ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly)) + @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX)$(SYMBOLSUFFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" + endif +ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) + @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" + ifeq ($(OSNAME), Darwin) + @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" +endif +else +#only static + @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" +endif +endif +endif + #Generating OpenBLASConfigVersion.cmake @echo Generating $(OPENBLAS_CMAKE_CONFIG_VERSION) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) @echo "set (PACKAGE_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" diff --git a/Makefile.power b/Makefile.power index 3fa6d6faf8..97a466d138 100644 --- a/Makefile.power +++ b/Makefile.power @@ -13,16 +13,16 @@ ifeq ($(CORE), POWER10) ifneq ($(C_COMPILER), PGI) ifeq ($(C_COMPILER), GCC) ifeq ($(GCCVERSIONGTEQ10), 1) -CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math +CCOMMON_OPT += -O3 -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math else ifneq ($(GCCVERSIONGT4), 1) $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) -CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math +CCOMMON_OPT += -O3 -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math else $(warning your compiler is too old to fully support POWER10, getting a newer version of gcc is recommended) -CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math +CCOMMON_OPT += -O3 -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math endif else -CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math +CCOMMON_OPT += -O3 -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math endif ifeq ($(F_COMPILER), IBM) FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr10 -qtune=pwr10 -qfloat=nomaf -qzerosize @@ -34,7 +34,7 @@ endif ifeq ($(CORE), POWER9) ifneq ($(C_COMPILER), PGI) -CCOMMON_OPT += -Ofast -mvsx -fno-fast-math +CCOMMON_OPT += -O3 -mvsx -fno-fast-math ifeq ($(C_COMPILER), GCC) ifneq ($(GCCVERSIONGT4), 1) $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) @@ -70,7 +70,7 @@ endif ifeq ($(CORE), POWER8) ifneq ($(C_COMPILER), PGI) -CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math +CCOMMON_OPT += -O3 -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math else CCOMMON_OPT += -fast -Mvect=simd -Mcache_align endif @@ -118,6 +118,10 @@ ifeq ($(F_COMPILER), GFORTRAN) FCOMMON_OPT += -fno-optimize-sibling-calls endif +ifeq ($(F_COMPILER), FLANGNEW) +override FCOMMON_OPT := $(filter-out -frecursive,$(FCOMMON_OPT)) +endif + FLAMEPATH = $(HOME)/flame/lib #ifeq ($(CORE), CELL) @@ -147,12 +151,15 @@ endif ifdef BINARY64 +#Skip C/Fortran compiler combination checks in AIX if NO_FORTRAN or ONLY_CBLAS is set +ifeq ($(filter 1,$(NO_FORTRAN) $(ONLY_CBLAS)),) ifeq ($(C_COMPILER)$(F_COMPILER)$(OSNAME), GCCIBMAIX) $(error Using GCC and XLF on AIX is not a supported combination.) endif ifeq ($(C_COMPILER)$(F_COMPILER)$(OSNAME), CLANGGFORTRANAIX) $(error Using Clang and gFortran on AIX is not a supported combination.) endif +endif ifeq ($(OSNAME), AIX) ifeq ($(C_COMPILER), GCC) diff --git a/Makefile.riscv64 b/Makefile.riscv64 index 0ee26c1b5c..d8da98d5f0 100644 --- a/Makefile.riscv64 +++ b/Makefile.riscv64 @@ -6,13 +6,20 @@ ifeq ($(CORE), x280) CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static endif +RISCV64_OPT = rv64imafdcv +ifeq ($(BUILD_HFLOAT16), 1) +RISCV64_OPT := $(RISCV64_OPT)_zvfh_zfh +endif +ifeq ($(BUILD_BFLOAT16), 1) +RISCV64_OPT := $(RISCV64_OPT)_zvfbfwma +endif ifeq ($(CORE), RISCV64_ZVL256B) -CCOMMON_OPT += -march=rv64imafdcv_zvl256b -mabi=lp64d -FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d +CCOMMON_OPT += -march=$(RISCV64_OPT)_zvl256b -mabi=lp64d +FCOMMON_OPT += -march=$(RISCV64_OPT) -mabi=lp64d endif ifeq ($(CORE), RISCV64_ZVL128B) -CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d +CCOMMON_OPT += -march=$(RISCV64_OPT) -mabi=lp64d +FCOMMON_OPT += -march=$(RISCV64_OPT) -mabi=lp64d endif ifeq ($(CORE), RISCV64_GENERIC) CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d diff --git a/Makefile.rule b/Makefile.rule index 0d35d8a67d..51ac7bab68 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -308,6 +308,8 @@ COMMON_PROF = -pg # If you want to enable the experimental BFLOAT16 support # BUILD_BFLOAT16 = 1 +# If you want to enable the experimental HFLOAT16 support +# BUILD_HFLOAT16 = 1 # Set the thread number threshold beyond which the job array for the threaded level3 BLAS # will be allocated on the heap rather than the stack. (This array alone requires diff --git a/Makefile.system b/Makefile.system index 38646c3c6b..ab463f53db 100644 --- a/Makefile.system +++ b/Makefile.system @@ -270,20 +270,22 @@ SMALL_MATRIX_OPT = 1 BUILD_BFLOAT16 = 1 else ifeq ($(ARCH), arm64) SMALL_MATRIX_OPT = 1 +BUILD_BFLOAT16 = 1 endif ifeq ($(ARCH), loongarch64) SMALL_MATRIX_OPT = 1 endif ifeq ($(ARCH), arm64) GEMM_GEMV_FORWARD = 1 -GEMM_GEMV_FORWARD_BF16 = 1 +SBGEMM_GEMV_FORWARD = 1 +BGEMM_GEMV_FORWARD = 1 endif ifeq ($(ARCH), riscv) GEMM_GEMV_FORWARD = 1 endif ifeq ($(ARCH), power) GEMM_GEMV_FORWARD = 1 -GEMM_GEMV_FORWARD_BF16 = 1 +SBGEMM_GEMV_FORWARD = 1 endif ifeq ($(SMALL_MATRIX_OPT), 1) @@ -293,8 +295,11 @@ ifneq ($(ONLY_CBLAS), 1) ifeq ($(GEMM_GEMV_FORWARD), 1) CCOMMON_OPT += -DGEMM_GEMV_FORWARD endif -ifeq ($(GEMM_GEMV_FORWARD_BF16), 1) -CCOMMON_OPT += -DGEMM_GEMV_FORWARD_BF16 +ifeq ($(SBGEMM_GEMV_FORWARD), 1) +CCOMMON_OPT += -DSBGEMM_GEMV_FORWARD +endif +ifeq ($(BGEMM_GEMV_FORWARD), 1) +CCOMMON_OPT += -DBGEMM_GEMV_FORWARD endif endif @@ -326,6 +331,7 @@ HAVE_SSE5= HAVE_AVX= HAVE_AVX2= HAVE_FMA3= +HAVE_SME= include $(TOPDIR)/Makefile_kernel.conf endif @@ -393,6 +399,8 @@ GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11) GCCVERSIONGTEQ12 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 12) +GCCVERSIONGTEQ13 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 13) +GCCVERSIONGTEQ14 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 14) # Note that the behavior of -dumpversion is compile-time-configurable for # gcc-7.x and newer. Use -dumpfullversion there ifeq ($(GCCVERSIONGTEQ7),1) @@ -419,10 +427,8 @@ ifeq ($(OSNAME), Darwin) ifndef MACOSX_DEPLOYMENT_TARGET ifeq ($(ARCH), arm64) export MACOSX_DEPLOYMENT_TARGET=11.0 -ifeq ($(C_COMPILER), GCC) export NO_SVE = 1 -export NO_SME = 1 -endif +# export NO_SME = 1 else export MACOSX_DEPLOYMENT_TARGET=10.8 endif @@ -436,12 +442,10 @@ ifeq (x$(XCVER), x 15) CCOMMON_OPT += -Wl,-ld_classic FCOMMON_OPT += -Wl,-ld_classic endif -ifeq (x$(XCVER), x 16) -ifeq ($(F_COMPILER), GFORTRAN) +ifeq ($(shell [ $(XCVER) -ge 16 ] && echo yes),yes) override CEXTRALIB := $(filter-out(-lto_library, $(CEXTRALIB))) endif endif -endif ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly)) MD5SUM = md5 -r @@ -718,6 +722,11 @@ DYNAMIC_CORE += A64FX endif ifneq ($(NO_SME), 1) DYNAMIC_CORE += ARMV9SME +ifeq ($(OSNAME), Darwin) +ifneq ($(C_COMPILER), GCC) +DYNAMIC_CORE += VORTEXM4 +endif +endif endif DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 @@ -884,7 +893,7 @@ NO_BINARY_MODE = 1 BINARY_DEFINED = 1 ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) -ifeq ($(F_COMPILER), GFORTRAN) +ifeq ($(F_COMPILER), $(filter $(F_COMPILER),GFORTRAN FLANGNEW)) FCOMMON_OPT += -fdefault-integer-8 endif ifeq ($(F_COMPILER), FLANG) @@ -899,7 +908,7 @@ NO_BINARY_MODE = 1 BINARY_DEFINED = 1 ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) -ifeq ($(F_COMPILER), GFORTRAN) +ifeq ($(F_COMPILER), $(filter $(F_COMPILER),GFORTRAN FLANGNEW)) FCOMMON_OPT += -fdefault-integer-8 endif ifeq ($(F_COMPILER), FLANG) @@ -914,7 +923,7 @@ NO_BINARY_MODE = 1 BINARY_DEFINED = 1 ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) -ifeq ($(F_COMPILER), GFORTRAN) +ifeq ($(F_COMPILER), $(filter $(F_COMPILER),GFORTRAN FLANGNEW)) FCOMMON_OPT += -fdefault-integer-8 endif ifeq ($(F_COMPILER), FLANG) @@ -1191,6 +1200,13 @@ endif else ifeq ($(ARCH), $(filter $(ARCH),mips)) FCOMMON_OPT += -mabi=32 endif +ifeq ($(ARCH), $(filter $(ARCH),loongarch64)) +ifdef INTERFACE64 +ifneq ($(INTERFACE64), 0) +FCOMMON_OPT += -fdefault-integer-8 +endif +endif +endif else ifdef BINARY64 ifneq ($(OSNAME), AIX) @@ -1547,6 +1563,9 @@ endif ifeq ($(BUILD_BFLOAT16), 1) CCOMMON_OPT += -DBUILD_BFLOAT16 endif +ifeq ($(BUILD_HFLOAT16), 1) +CCOMMON_OPT += -DBUILD_HFLOAT16 +endif ifeq ($(BUILD_SINGLE), 1) CCOMMON_OPT += -DBUILD_SINGLE=1 endif @@ -1883,17 +1902,23 @@ ifndef NO_MSA export HAVE_MSA export MSA_FLAGS endif +export HAVE_SME export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE export NO_AVX512 export NO_AVX2 export BUILD_BFLOAT16 +export BUILD_HFLOAT16 export NO_LSX export NO_LASX +export BGEMM_UNROLL_M +export BGEMM_UNROLL_N export SBGEMM_UNROLL_M export SBGEMM_UNROLL_N +export SHGEMM_UNROLL_M +export SHGEMM_UNROLL_N export SGEMM_UNROLL_M export SGEMM_UNROLL_N export DGEMM_UNROLL_M diff --git a/Makefile.tail b/Makefile.tail index 54ba649dbf..4a12f531e2 100644 --- a/Makefile.tail +++ b/Makefile.tail @@ -1,4 +1,34 @@ +############################################################################### +# Copyright (c) 2025, The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +############################################################################### + +BBLASOBJS_P = $(BBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) SBBLASOBJS_P = $(SBBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) +SHBLASPBJS_P = $(SHBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) SBLASOBJS_P = $(SBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) DBLASOBJS_P = $(DBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) QBLASOBJS_P = $(QBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) @@ -11,8 +41,8 @@ COMMONOBJS_P = $(COMMONOBJS:.$(SUFFIX)=.$(PSUFFIX)) HPLOBJS_P = $(HPLOBJS:.$(SUFFIX)=.$(PSUFFIX)) -BLASOBJS = $(SBEXTOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) $(CBAUXOBJS) -BLASOBJS_P = $(SBEXTOBJS_P) $(SBBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) $(CBAUXOBJS_P) +BLASOBJS = $(SHBLASOBJS) $(BBLASOBJS) $(SBEXTOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) $(CBAUXOBJS) +BLASOBJS_P = $(SHBLASPBJS_P) $(BBLASOBJS_P) $(SBEXTOBJS_P) $(SBBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) $(CBAUXOBJS_P) ifdef EXPRECISION BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) @@ -24,6 +54,8 @@ BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P) endif +$(SHBLASOBJS) $(SHBLASOBJS_P) : override CFLAGS += -DHFLOAT16 -UDOUBLE -UCOMPLEX +$(BBLASOBJS) $(BBLASOBJS_P) : override CFLAGS += -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX $(SBBLASOBJS) $(SBBLASOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE -UCOMPLEX $(SBLASOBJS) $(SBLASOBJS_P) : override CFLAGS += -UDOUBLE -UCOMPLEX $(DBLASOBJS) $(DBLASOBJS_P) : override CFLAGS += -DDOUBLE -UCOMPLEX @@ -33,6 +65,8 @@ $(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE -DCOMPLEX $(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX $(SBEXTOBJS) $(SBEXTOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE -UCOMPLEX +$(SHBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) +$(BBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(SBBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(DBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) diff --git a/README.md b/README.md index cc9325d39d..6a4fc6a7f2 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ For a general introduction to the BLAS routines, please refer to the extensive d We provide official binary packages for the following platform: * Windows x86/x86_64 + * Windows arm64 (woa) You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/) or from the [Releases section of the GitHub project page](https://github.com/OpenMathLib/OpenBLAS/releases). diff --git a/TargetList.txt b/TargetList.txt index 232e12ffa6..4903261e2c 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -52,6 +52,7 @@ POWER7 POWER8 POWER9 POWER10 +POWER11 PPCG4 PPC970 PPC970MP @@ -101,6 +102,7 @@ CORTEXX2 NEOVERSEN1 NEOVERSEV1 NEOVERSEN2 +NEOVERSEV2 CORTEXA55 EMAG8180 FALKOR @@ -109,6 +111,7 @@ THUNDERX2T99 TSV110 THUNDERX3T110 VORTEX +VORTEXM4 A64FX ARMV8SVE ARMV9SME diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 21e0b96936..9a3aa985c6 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -95,7 +95,7 @@ jobs: vmImage: 'windows-latest' steps: - script: | - mingw32-make CC=gcc FC=gfortran DYNAMIC_ARCH=1 DYNAMIC_LIST="SANDYBRIDGE" + mingw32-make CC=gcc NOLAPACK=1 DYNAMIC_ARCH=1 DYNAMIC_LIST="SANDYBRIDGE" - job: Windows_clang_cmake pool: @@ -155,7 +155,7 @@ jobs: - job: OSX_OpenMP pool: - vmImage: 'macOS-13' + vmImage: 'macOS-14' steps: - script: | brew update @@ -165,19 +165,19 @@ jobs: - job: OSX_GCC_Nothreads pool: - vmImage: 'macOS-13' + vmImage: 'macOS-14' steps: - script: | brew update make USE_THREADS=0 CC=gcc-13 FC=gfortran-13 -- job: OSX_GCC12 +- job: OSX_GCC15 pool: vmImage: 'macOS-latest' steps: - script: | brew update - make CC=gcc-12 FC=gfortran-12 + make CC=gcc-15 FC=gfortran-15 - job: OSX_LLVM_flangnew pool: @@ -201,7 +201,7 @@ jobs: - script: | brew update brew install llvm libomp - make TARGET=CORE2 USE_OPENMP=1 DYNAMIC_ARCH=1 CC=/usr/local/opt/llvm/bin/clang NOFORTRAN=1 + make TARGET=CORE2 USE_OPENMP=1 DYNAMIC_ARCH=1 DYNAMIC_LIST='NEHALEM HASWELL SKYLAKEX' CC=/usr/local/opt/llvm/bin/clang NOFORTRAN=1 - job: OSX_OpenMP_Clang_cmake pool: @@ -215,13 +215,13 @@ jobs: brew install llvm libomp mkdir build cd build - cmake -DTARGET=CORE2 -DUSE_OPENMP=1 -DINTERFACE64=1 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=/usr/local/opt/llvm/bin/clang -DNOFORTRAN=1 -DNO_AVX512=1 .. + cmake -DTARGET=CORE2 -DUSE_OPENMP=1 -DINTERFACE64=1 -DDYNAMIC_ARCH=1 -DDYNAMIC_LIST='NEHALEM HASWELL SKYLAKEX' -DCMAKE_C_COMPILER=/usr/local/opt/llvm/bin/clang -DNOFORTRAN=1 -DNO_AVX512=1 .. make ctest - job: OSX_dynarch_cmake pool: - vmImage: 'macOS-13' + vmImage: 'macOS-14' variables: LD_LIBRARY_PATH: /usr/local/opt/llvm/lib LIBRARY_PATH: /usr/local/opt/llvm/lib @@ -268,7 +268,7 @@ jobs: - job: OSX_NDK_ARMV7 pool: - vmImage: 'macOS-13' + vmImage: 'macOS-14' steps: - script: | brew update @@ -278,35 +278,37 @@ jobs: - job: OSX_IOS_ARMV8 pool: - vmImage: 'macOS-13' + vmImage: 'macOS-14' variables: - CC: /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang - CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_14.2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.2.sdk -arch arm64 -miphoneos-version-min=10.0 + CC: /Applications/Xcode_16.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_16.2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS18.2.sdk -arch arm64 -miphoneos-version-min=10.0 steps: - script: | + ls /Applications/Xcode_16.2.app/Contents/Developer/Platforms/ + ls /Applications/Xcode_16.2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 - job: OSX_IOS_ARMV7 pool: - vmImage: 'macOS-13' + vmImage: 'macOS-14' variables: - CC: /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang - CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_14.2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.2.sdk -arch armv7 -miphoneos-version-min=5.1 + CC: /Applications/Xcode_16.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_16.2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS18.2.sdk -arch armv7 -miphoneos-version-min=5.1 steps: - script: | make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 - job: OSX_xbuild_DYNAMIC_ARM64 pool: - vmImage: 'macOS-13' + vmImage: 'macOS-14' variables: - CC: /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang - CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_14.2.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX13.1.sdk -arch arm64 + CC: /Applications/Xcode_16.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_16.2.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX15.2.sdk -arch arm64 steps: - script: | - ls /Applications/Xcode_14.2.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs + ls /Applications/Xcode_16.2.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs /Applications/Xcode_12.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -arch arm64 --print-supported-cpus - /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang --version + /Applications/Xcode_16.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang --version make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 - job: ALPINE_MUSL diff --git a/benchmark/Makefile b/benchmark/Makefile index c295b14585..d82a06af92 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -1,3 +1,31 @@ +############################################################################### +# Copyright (c) 2025, The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +############################################################################### + TOPDIR = .. include $(TOPDIR)/Makefile.system @@ -56,9 +84,15 @@ GOTO_LAPACK_TARGETS= endif ifeq ($(BUILD_BFLOAT16),1) -GOTO_HALF_TARGETS=sbgemm.goto +GOTO_BFLOAT_TARGETS=bgemm.goto sbgemm.goto bgemv.goto sbgemv.goto +else +GOTO_BFLOAT_TARGETS= +endif + +ifeq ($(BUILD_HFLOAT16),1) +GOTO_HFLOAT_TARGETS=shgemm.goto else -GOTO_HALF_TARGETS= +GOTO_HFLOAT_TARGETS= endif ifeq ($(OSNAME), WINNT) @@ -104,7 +138,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto \ somatcopy.goto domatcopy.goto comatcopy.goto zomatcopy.goto \ - saxpby.goto daxpby.goto caxpby.goto zaxpby.goto $(GOTO_HALF_TARGETS) + saxpby.goto daxpby.goto caxpby.goto zaxpby.goto $(GOTO_BFLOAT_TARGETS) $(GOTO_HFLOAT_TARGETS) acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ @@ -278,7 +312,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ smin.goto dmin.goto \ saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ somatcopy.goto domatcopy.goto comatcopy.goto zomatcopy.goto \ - snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS) + snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_BFLOAT_TARGETS) $(GOTO_HFLOAT_TARGETS) acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ @@ -629,8 +663,19 @@ zcholesky.essl : zcholesky.$(SUFFIX) ##################################### Sgemm #################################################### ifeq ($(BUILD_BFLOAT16),1) +bgemm.goto : bgemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sbgemm.goto : sbgemm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm +bgemv.goto : bgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm +sbgemv.goto : sbgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm +endif + +ifeq ($(BUILD_HFLOAT16),1) +shgemm.goto : shgemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm endif sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME) @@ -2959,8 +3004,15 @@ zcholesky.$(SUFFIX) : cholesky.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ ifeq ($(BUILD_BFLOAT16),1) +bgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -DBFLOAT16 -DBGEMM -UCOMPLEX -UDOUBLE -o $(@F) $^ sbgemm.$(SUFFIX) : gemm.c - $(CC) $(CFLAGS) -c -DHALF -UCOMPLEX -UDOUBLE -o $(@F) $^ + $(CC) $(CFLAGS) -c -DBFLOAT16 -UCOMPLEX -UDOUBLE -o $(@F) $^ +endif + +ifeq ($(BUILD_HFLOAT16),1) +shgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -DHFLOAT16 -UCOMPLEX -UDOUBLE -o $(@F) $^ endif sgemm.$(SUFFIX) : gemm.c @@ -3098,6 +3150,13 @@ dgemv.$(SUFFIX) : gemv.c cgemv.$(SUFFIX) : gemv.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ +ifeq ($(BUILD_BFLOAT16),1) +bgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -DBFLOAT16 -DBGEMM -UCOMPLEX -UDOUBLE -o $(@F) $^ +sbgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -DBFLOAT16 -UCOMPLEX -UDOUBLE -o $(@F) $^ +endif () + zgemv.$(SUFFIX) : gemv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ diff --git a/benchmark/gemm.c b/benchmark/gemm.c index 35f5096f35..704e332251 100644 --- a/benchmark/gemm.c +++ b/benchmark/gemm.c @@ -33,10 +33,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef DOUBLE #define GEMM BLASFUNC(dgemm) -#elif defined(HALF) +#elif defined(BFLOAT16) && defined(BGEMM) +#define GEMM BLASFUNC(bgemm) +#elif defined(BFLOAT16) #define GEMM BLASFUNC(sbgemm) +#undef IFLOAT +#define IFLOAT bfloat16 +#elif defined(HFLOAT16) +#define GEMM BLASFUNC(shgemm) +#undef IFLOAT +#define IFLOAT hfloat16 #else #define GEMM BLASFUNC(sgemm) +#define IFLOAT float #endif #else @@ -53,8 +62,18 @@ int main(int argc, char *argv[]){ IFLOAT *a, *b; FLOAT *c; +#ifdef BGEMM + blasint one=1; + blasint two=2; + float alpha_in[] = {1.0, 0.0}; + float beta_in[] = {0.0, 0.0}; + FLOAT alpha[2], beta[2]; + sbstobf16_(&two, alpha_in, &one, alpha, &one); + sbstobf16_(&two, beta_in, &one, beta, &one); +#else FLOAT alpha[] = {1.0, 0.0}; FLOAT beta [] = {0.0, 0.0}; +#endif char transa = 'N'; char transb = 'N'; blasint m, n, k, i, j, lda, ldb, ldc; diff --git a/benchmark/gemv.c b/benchmark/gemv.c index fc39f3f3d4..884a9bb273 100644 --- a/benchmark/gemv.c +++ b/benchmark/gemv.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project +Copyright (c) 2014, 2025 The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -34,6 +34,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef DOUBLE #define GEMV BLASFUNC(dgemv) +#elif defined(BFLOAT16) && defined(BGEMM) +#define GEMV BLASFUNC(bgemv) +#elif defined(BFLOAT16) +#define GEMV BLASFUNC(sbgemv) +#undef IFLOAT +#define IFLOAT bfloat16 #else #define GEMV BLASFUNC(sgemv) #endif @@ -49,9 +55,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif int main(int argc, char *argv[]){ - FLOAT *a, *x, *y; - FLOAT alpha[] = {1.0, 1.0}; - FLOAT beta [] = {1.0, 0.0}; + IFLOAT *a, *x; + FLOAT *y; +#ifdef BGEMM + blasint one=1; + blasint two=2; + float alpha_in[] = {1.0, 0.0}; + float beta_in[] = {0.0, 0.0}; + FLOAT alpha[2], beta[2]; + sbstobf16_(&two, alpha_in, &one, alpha, &one); + sbstobf16_(&two, beta_in, &one, beta, &one); +#else + FLOAT alpha[] = {1.0, 0.0}; + FLOAT beta [] = {0.0, 0.0}; +#endif char trans='N'; blasint m, i, j; blasint inc_x=1,inc_y=1; @@ -97,11 +114,11 @@ int main(int argc, char *argv[]){ fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops); - if (( a = (FLOAT *)malloc(sizeof(FLOAT) * tomax * tomax * COMPSIZE)) == NULL){ + if (( a = (IFLOAT *)malloc(sizeof(IFLOAT) * tomax * tomax * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_x) * COMPSIZE)) == NULL){ + if (( x = (IFLOAT *)malloc(sizeof(IFLOAT) * tomax * abs(inc_x) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } @@ -125,7 +142,7 @@ int main(int argc, char *argv[]){ fprintf(stderr, " %6dx%d : ", (int)m,(int)n); for(j = 0; j < m; j++){ for(i = 0; i < n * COMPSIZE; i++){ - a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((IFLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } @@ -133,7 +150,7 @@ int main(int argc, char *argv[]){ { for(i = 0; i < n * COMPSIZE * abs(inc_x); i++){ - x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + x[i] = ((IFLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ diff --git a/c_check b/c_check index 9801b87783..40db574702 100755 --- a/c_check +++ b/c_check @@ -335,7 +335,14 @@ if [ "$architecture" = "arm64" ]; then fi no_sme=0 +is_appleclang=0 if [ "$architecture" = "arm64" ]; then + if [ "$compiler" = "CLANG" ]; then + data=`$compiler_name --version` + case "$data" in Apple*) + is_appleclang=1 + esac + fi tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') tmpf="$tmpd/a.S" printf ".text \n.global sme_test\n\nsme_test:\nsmstart\nsmstop\nret\n">> "$tmpf" @@ -469,6 +476,7 @@ done [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n" [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n" + [ "$is_appleclang" -eq 1 ] && printf "APPLECLANG=1\n" exit 0 } @@ -499,6 +507,7 @@ done [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n" [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n" + [ "$is_appleclang" -eq 1 ] && printf "APPLECLANG=1\n" [ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n" [ "$no_lasx" -eq 1 ] && printf "NO_LASX=1\n" } >> "$makefile" diff --git a/cblas.h b/cblas.h index 83686f7433..8395f1b8b2 100644 --- a/cblas.h +++ b/cblas.h @@ -1,3 +1,31 @@ +/*************************************************************************** + * Copyright (c) 2025, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + #ifndef CBLAS_H #define CBLAS_H @@ -428,6 +456,14 @@ void cblas_cgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enu void cblas_zgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array, OPENBLAS_CONST void * alpha_array, OPENBLAS_CONST void ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST void ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST void * beta_array, void ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size); +void cblas_sgemm_batch_strided(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float alpha, OPENBLAS_CONST float * A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST blasint stridea, OPENBLAS_CONST float * B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST blasint strideb, OPENBLAS_CONST float beta, float * C, OPENBLAS_CONST blasint ldc, OPENBLAS_CONST blasint stridec, OPENBLAS_CONST blasint group_size); + +void cblas_dgemm_batch_strided(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double alpha, OPENBLAS_CONST double * A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST blasint stridea, OPENBLAS_CONST double * B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST blasint strideb, OPENBLAS_CONST double beta, double * C, OPENBLAS_CONST blasint ldc, OPENBLAS_CONST blasint stridec, OPENBLAS_CONST blasint group_size); + +void cblas_cgemm_batch_strided(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void * alpha, OPENBLAS_CONST void * A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST blasint stridea, OPENBLAS_CONST void * B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST blasint strideb, OPENBLAS_CONST void * beta, void * C, OPENBLAS_CONST blasint ldc, OPENBLAS_CONST blasint stridec, OPENBLAS_CONST blasint group_size); + +void cblas_zgemm_batch_strided(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void * alpha, OPENBLAS_CONST void * A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST blasint stridea, OPENBLAS_CONST void * B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST blasint strideb, OPENBLAS_CONST void * beta, void * C, OPENBLAS_CONST blasint ldc, OPENBLAS_CONST blasint stridec, OPENBLAS_CONST blasint group_size); + /*** BFLOAT16 and INT8 extensions ***/ /* convert float array to BFLOAT16 array by rounding */ void cblas_sbstobf16(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *in, OPENBLAS_CONST blasint incin, bfloat16 *out, OPENBLAS_CONST blasint incout); @@ -437,15 +473,23 @@ void cblas_sbdtobf16(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *in, OPEN void cblas_sbf16tos(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, float *out, OPENBLAS_CONST blasint incout); /* convert BFLOAT16 array to double array */ void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, double *out, OPENBLAS_CONST blasint incout); +void cblas_bgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 alpha, OPENBLAS_CONST bfloat16 *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 beta, bfloat16 *y, OPENBLAS_CONST blasint incy); /* dot production of BFLOAT16 input arrays, and output as float */ float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy); void cblas_sbgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy); +void cblas_bgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, + OPENBLAS_CONST bfloat16 alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST bfloat16 beta, bfloat16 *C, OPENBLAS_CONST blasint ldc); void cblas_sbgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); void cblas_sbgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array, OPENBLAS_CONST float * alpha_array, OPENBLAS_CONST bfloat16 ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST bfloat16 ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST float * beta_array, float ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size); +void cblas_sbgemm_batch_strided(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 * A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST blasint stridea, OPENBLAS_CONST bfloat16 * B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST blasint strideb, OPENBLAS_CONST float beta, float * C, OPENBLAS_CONST blasint ldc, OPENBLAS_CONST blasint stridec, OPENBLAS_CONST blasint group_size); +/*** FLOAT16 extensions ***/ +void cblas_shgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, + OPENBLAS_CONST float alpha, OPENBLAS_CONST hfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST hfloat16 *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); + #ifdef __cplusplus } #endif /* __cplusplus */ diff --git a/cmake/OpenBLASConfig.cmake.in b/cmake/OpenBLASConfig.cmake.in index 87a1621b4a..2b5387a0c0 100644 --- a/cmake/OpenBLASConfig.cmake.in +++ b/cmake/OpenBLASConfig.cmake.in @@ -50,30 +50,30 @@ set(PN OpenBLAS) # need to check that the @USE_*@ evaluate to something cmake can perform boolean logic upon if(@USE_OPENMP@) - set(${PN}_openmp_FOUND 1) + set(${PN}@SUFFIX64@_openmp_FOUND 1) elseif(@USE_THREAD@) - set(${PN}_pthread_FOUND 1) + set(${PN}@SUFFIX64@_pthread_FOUND 1) else() - set(${PN}_serial_FOUND 1) + set(${PN}@SUFFIX64@_serial_FOUND 1) endif() -check_required_components(${PN}) +check_required_components(${PN}@SUFFIX64@) #----------------------------------------------------------------------------- # Don't include targets if this file is being picked up by another # project which has already built this as a subproject #----------------------------------------------------------------------------- -if(NOT TARGET ${PN}::OpenBLAS) - include("${CMAKE_CURRENT_LIST_DIR}/${PN}Targets.cmake") +if(NOT TARGET ${PN}@SUFFIX64@::OpenBLAS) + include("${CMAKE_CURRENT_LIST_DIR}/${PN}@SUFFIX64@Targets.cmake") - get_property(_loc TARGET ${PN}::OpenBLAS PROPERTY LOCATION) - set(${PN}_LIBRARY ${_loc}) - get_property(_ill TARGET ${PN}::OpenBLAS PROPERTY INTERFACE_LINK_LIBRARIES) - set(${PN}_LIBRARIES ${_ill}) + get_property(_loc TARGET ${PN}@SUFFIX64@::OpenBLAS PROPERTY LOCATION) + set(${PN}@SUFFIX64@_LIBRARY ${_loc}) + get_property(_ill TARGET ${PN}@SUFFIX64@::OpenBLAS PROPERTY INTERFACE_LINK_LIBRARIES) + set(${PN}@SUFFIX64@_LIBRARIES ${_ill}) - get_property(_id TARGET ${PN}::OpenBLAS PROPERTY INCLUDE_DIRECTORIES) - set(${PN}_INCLUDE_DIR ${_id}) - get_property(_iid TARGET ${PN}::OpenBLAS PROPERTY INTERFACE_INCLUDE_DIRECTORIES) - set(${PN}_INCLUDE_DIRS ${_iid}) + get_property(_id TARGET ${PN}@SUFFIX64@::OpenBLAS PROPERTY INCLUDE_DIRECTORIES) + set(${PN}@SUFFIX64@_INCLUDE_DIR ${_id}) + get_property(_iid TARGET ${PN}@SUFFIX64@::OpenBLAS PROPERTY INTERFACE_INCLUDE_DIRECTORIES) + set(${PN}@SUFFIX64@_INCLUDE_DIRS ${_iid}) endif() diff --git a/cmake/arch.cmake b/cmake/arch.cmake index d9a7aafd62..ef0b63654a 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -40,13 +40,16 @@ if (DYNAMIC_ARCH) endif () if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 14) # SME ACLE supported in GCC >= 14 set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME) + endif() + if (${CMAKE_C_COMPILER_ID} MATCHES "Clang" AND ${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") + set(DYNAMIC_CORE ${DYNAMIC_CORE} VORTEXM4) endif() elseif (${CMAKE_C_COMPILER_ID} MATCHES "Clang") if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 11) # SVE ACLE supported in LLVM >= 11 set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) endif () - if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 19) # SME ACLE supported in LLVM >= 19 - set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME) + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 19 OR (${CMAKE_C_COMPILER_ID} MATCHES AppleClang AND ${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 17) ) # SME ACLE supported in LLVM >= 19 and AppleClang >= 17 + set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME VORTEXM4) endif() endif () if (DYNAMIC_LIST) @@ -94,7 +97,13 @@ if (DYNAMIC_ARCH) set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST}) endif () endif () - + + if (ZARCH) + set(DYNAMIC_CORE Z13 Z14 ZARCH_GENERIC) + set(DYN_Z13 1) + set(DYN_Z14 1) + endif () + if (LOONGARCH64) set(DYNAMIC_CORE LA64_GENERIC LA264 LA464) endif () diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 66b316f7f0..27afd357a5 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -1,3 +1,30 @@ +############################################################################### +# Copyright (c) 2025, The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +############################################################################### ## ## Author: Hank Anderson ## Description: Ported from portion of OpenBLAS/Makefile.system @@ -59,6 +86,10 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS set(BINARY_DEFINED 1) endif () + if (ZARCH) + set (BINARY_DEFINED 1) + endif () + if (CMAKE_SYSTEM_NAME STREQUAL "AIX") set(BINARY_DEFINED 1) endif () @@ -182,7 +213,7 @@ endif () if (${CORE} STREQUAL A64FX) if (NOT DYNAMIC_ARCH) - if (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) + if (${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC" AND NOT NO_SVE) set (CCOMMON_OPT "${CCOMMON_OPT} -tp=a64fx") elseif (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0) set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=a64fx") @@ -192,17 +223,35 @@ if (${CORE} STREQUAL A64FX) endif () endif () +if (${CORE} STREQUAL NEOVERSEV2) + if (NOT DYNAMIC_ARCH) + if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) + set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-v2") + elseif (${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC" AND NOT NO_SVE) + set (CCOMMON_OPT "${CCOMMON_OPT} -tp=neoverse-v2") + else () + if (${GCC_VERSION} VERSION_GREATER 13.0 OR ${GCC_VERSION} VERSION_EQUAL 13.0) + set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=neoverse-v2") + elseif (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve+bf16 -mtune=neoverse-v1") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve+bf16") + endif() + endif () + endif () +endif () + if (${CORE} STREQUAL NEOVERSEN2) if (NOT DYNAMIC_ARCH) if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") - elseif (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) + elseif (${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC" AND NOT NO_SVE) set (CCOMMON_OPT "${CCOMMON_OPT} -tp=neoverse-v2") else () - if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) + if (${GCC_VERSION} VERSION_GREATER 11.1 OR ${GCC_VERSION} VERSION_EQUAL 11.1) set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") else () - set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve+bf16") endif() endif () endif () @@ -211,14 +260,14 @@ endif () if (${CORE} STREQUAL NEOVERSEV1) if (NOT DYNAMIC_ARCH) if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) - set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1") - elseif (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) + set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.4-a+sve+bf16 -mtune=neoverse-v1") + elseif (${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC" AND NOT NO_SVE) set (CCOMMON_OPT "${CCOMMON_OPT} -tp=neoverse-v1") else () if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) - set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1") + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve+bf16 -mtune=neoverse-v1") else () - set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve+bf16") endif() endif() endif () @@ -226,7 +275,7 @@ endif () if (${CORE} STREQUAL NEOVERSEN1) if (NOT DYNAMIC_ARCH) - if (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) + if (${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC" AND NOT NO_SVE) set (CCOMMON_OPT "${CCOMMON_OPT} -tp=neoverse-n1") elseif (${GCC_VERSION} VERSION_GREATER 9.4 OR ${GCC_VERSION} VERSION_EQUAL 9.4) set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a -mtune=neoverse-n1") @@ -236,11 +285,23 @@ if (${CORE} STREQUAL NEOVERSEN1) endif () endif () +if (${CORE} STREQUAL AMPEREONE) + if (NOT DYNAMIC_ARCH) + if (${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC") + set (CCOMMON_OPT "${CCOMMON_OPT} -tp=neoverse-n1") + elseif (${GCC_VERSION} VERSION_GREATER 12.1) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.6-a+crypto+crc+fp16+sha3+rng -mtune=ampereone") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.6-a+fp16") + endif() + endif () +endif () + if (${CORE} STREQUAL ARMV8SVE) if (NOT DYNAMIC_ARCH) if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8-a+sve") - elseif (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) + elseif (${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC" AND NOT NO_SVE) set (CCOMMON_OPT "${CCOMMON_OPT} -tp=host") else () set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") @@ -250,11 +311,28 @@ endif () if (${CORE} STREQUAL ARMV9SME) if (NOT DYNAMIC_ARCH) - if (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) + if (${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC" AND NOT NO_SVE) set (CCOMMON_OPT "${CCOMMON_OPT} -tp=host") else () set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv9-a+sme") + if (${OSNAME} STREQUAL Windows AND ${CMAKE_C_COMPILER_ID} MATCHES "Clang" ) + set (CCOMMON_OPT "${CCOMMON_OPT} --aarch64-stack-hazard-size=0") endif () + endif () + endif () +endif () + +if (${CORE} STREQUAL VORTEXM4) + if (NOT DYNAMIC_ARCH) + if (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) + set (CCOMMON_OPT "${CCOMMON_OPT} -tp=host") + else () + if (${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sme -mcpu=apple-m4") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a -mcpu=apple-m4") + endif () + endif () endif () endif () @@ -330,6 +408,30 @@ if (${CORE} STREQUAL PPCG4) endif () endif () + +if ((${CORE} STREQUAL RISCV64_ZVL128B) OR (${CORE} STREQUAL RISCV64_ZVL256B)) + set (RISCV64_OPT "rv64imafdcv") + if (BUILD_BFLOAT16) + set (RISCV64_OPT "${RISCV64_OPT}_zvfbfwma") + endif() + if (BUILD_HFLOAT16) + set (RISCV64_OPT "${RISCV64_OPT}_zvfh_zfh") + endif() + if (${CORE} STREQUAL RISCV64_ZVL256B) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=${RISCV64_OPT}_zvl256b -mabi=lp64d") + endif() + if (${CORE} STREQUAL RISCV64_ZVL128B) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=${RISCV64_OPT}_zvl128b -mabi=lp64d") + endif() +endif() +if (${CORE} STREQUAL RISCV64_GENERIC) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=rv64imafdc -mabi=lp64d") +endif() +if (${CORE} STREQUAL x280) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d") +endif() + + if (NOT DYNAMIC_ARCH) if (HAVE_AVX2) set (CCOMMON_OPT "${CCOMMON_OPT} -mavx2") diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 2cea6d9e6e..3b9e3849d9 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -1,3 +1,31 @@ +############################################################################### +# Copyright (c) 2025, The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +############################################################################### + # helper functions for the kernel CMakeLists.txt function(SetFallback KERNEL SOURCE_PATH) @@ -82,6 +110,7 @@ macro(SetDefaultL1) SetFallback(SROTMKERNEL rotm.S) SetFallback(DROTMKERNEL rotm.S) SetFallback(QROTMKERNEL rotm.S) + SetFallback(BSCALKERNEL ../generic/scal.c) SetFallback(SSCALKERNEL scal.S) SetFallback(DSCALKERNEL scal.S) SetFallback(CSCALKERNEL zscal.S) @@ -141,9 +170,15 @@ if (BUILD_BFLOAT16) SetFallback(SHSWAPKERNEL ../arm/swap.c) SetFallback(TOBF16KERNEL ../x86_64/tobf16.c) SetFallback(BF16TOKERNEL ../x86_64/bf16to.c) + SetFallback(BGEMVNKERNEL ../generic/gemv_n.c) + SetFallback(BGEMVTKERNEL ../generic/gemv_t.c) SetFallback(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) SetFallback(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) endif () +if (BUILD_HFLOAT16) + SetFallback(SHGEMVNKERNEL ../generic/gemv_n.c) + SetFallback(SHGEMVTKERNEL ../generic/gemv_t.c) +endif () endmacro () macro(SetDefaultL2) @@ -193,8 +228,14 @@ macro(SetDefaultL2) SetFallback(XHEMV_V_KERNEL ../generic/zhemv_k.c) SetFallback(XHEMV_M_KERNEL ../generic/zhemv_k.c) if (BUILD_BFLOAT16) + SetFallback(BGEMVNKERNEL ../generic/gemv_n.c) + SetFallback(BGEMVTKERNEL ../generic/gemv_t.c) SetFallback(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) SetFallback(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) +endif () +if (BUILD_HFLOAT16) + SetFallback(SHGEMVNKERNEL ../generic/gemv_n.c) + SetFallback(SHGEMVTKERNEL ../generic/gemv_t.c) SetFallback(SHGERKERNEL ../generic/ger.c) endif () endmacro () @@ -206,6 +247,16 @@ macro(SetDefaultL3) SetFallback(ZGEADD_KERNEL ../generic/zgeadd.c) if (BUILD_BFLOAT16) SetFallback(SHGEADD_KERNEL ../generic/geadd.c) + SetFallback(BGEMMKERNEL ../generic/gemmkernel_2x2.c) + SetFallback(BGEMM_BETA ../generic/gemm_beta.c) + SetFallback(BGEMMINCOPY ../generic/gemm_ncopy_2.c) + SetFallback(BGEMMITCOPY ../generic/gemm_tcopy_2.c) + SetFallback(BGEMMONCOPY ../generic/gemm_ncopy_2.c) + SetFallback(BGEMMOTCOPY ../generic/gemm_tcopy_2.c) + SetFallback(BGEMMINCOPYOBJ bgemm_incopy.o) + SetFallback(BGEMMITCOPYOBJ bgemm_itcopy.o) + SetFallback(BGEMMONCOPYOBJ bgemm_oncopy.o) + SetFallback(BGEMMOTCOPYOBJ bgemm_otcopy.o) SetFallback(SBGEMMKERNEL ../generic/gemmkernel_2x2.c) SetFallback(SBGEMM_BETA ../generic/gemm_beta.c) SetFallback(SBGEMMINCOPY ../generic/gemm_ncopy_2.c) @@ -217,5 +268,16 @@ if (BUILD_BFLOAT16) SetFallback(SBGEMMONCOPYOBJ sbgemm_oncopy.o) SetFallback(SBGEMMOTCOPYOBJ sbgemm_otcopy.o) endif () - +if (BUILD_HFLOAT16) + SetFallback(SHGEMMKERNEL ../generic/gemmkernel_2x2.c) + SetFallback(SHGEMM_BETA ../generic/gemm_beta.c) + SetFallback(SHGEMMINCOPY ../generic/gemm_ncopy_2.c) + SetFallback(SHGEMMITCOPY ../generic/gemm_tcopy_2.c) + SetFallback(SHGEMMONCOPY ../generic/gemm_ncopy_2.c) + SetFallback(SHGEMMOTCOPY ../generic/gemm_tcopy_2.c) + SetFallback(SHGEMMINCOPYOBJ shgemm_incopy.o) + SetFallback(SHGEMMITCOPYOBJ shgemm_itcopy.o) + SetFallback(SHGEMMONCOPYOBJ shgemm_oncopy.o) + SetFallback(SHGEMMOTCOPYOBJ shgemm_otcopy.o) +endif () endmacro () diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 6a74fb7640..a8d1c601c8 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -11,7 +11,7 @@ set(SCLAUX la_constants.f90 sbdsdc.f sbdsqr.f sdisna.f slabad.f slacpy.f sladiv.f slae2.f slaebz.f - slaed0.f slaed1.f slaed2.f slaed3.f slaed4.f slaed5.f slaed6.f + slaed0.f slaed1.f slaed2.f slaed4.f slaed5.f slaed6.f slaed7.f slaed8.f slaed9.f slaeda.f slaev2.f slagtf.f slagts.f slamrg.f slanst.f slapy2.f slapy3.f slarnv.f @@ -31,7 +31,7 @@ set(DZLAUX dbdsdc.f dbdsvdx.f dstevx.f dstein.f dbdsqr.f ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f - dlaed0.f dlaed1.f dlaed2.f dlaed3.f dlaed4.f dlaed5.f dlaed6.f + dlaed0.f dlaed1.f dlaed2.f dlaed4.f dlaed5.f dlaed6.f dlaed7.f dlaed8.f dlaed9.f dlaeda.f dlaev2.f dlagtf.f dlagts.f dlamrg.f dlanst.f dlapy2.f dlapy3.f dlarnv.f @@ -517,7 +517,7 @@ set(SCLAUX scombssq.c sbdsvdx.c sstevx.c sstein.c sbdsdc.c sbdsqr.c sdisna.c slabad.c slacpy.c sladiv.c slae2.c slaebz.c - slaed0.c slaed1.c slaed2.c slaed3.c slaed4.c slaed5.c slaed6.c + slaed0.c slaed1.c slaed2.c slaed4.c slaed5.c slaed6.c slaed7.c slaed8.c slaed9.c slaeda.c slaev2.c slagtf.c slagts.c slamrg.c slanst.c slapy2.c slapy3.c slarnv.c @@ -536,7 +536,7 @@ set(DZLAUX dbdsdc.c dbdsvdx.c dstevx.c dstein.c dbdsqr.c ddisna.c dlabad.c dlacpy.c dladiv.c dlae2.c dlaebz.c - dlaed0.c dlaed1.c dlaed2.c dlaed3.c dlaed4.c dlaed5.c dlaed6.c + dlaed0.c dlaed1.c dlaed2.c dlaed4.c dlaed5.c dlaed6.c dlaed7.c dlaed8.c dlaed9.c dlaeda.c dlaev2.c dlagtf.c dlagts.c dlamrg.c dlanst.c dlapy2.c dlapy3.c dlarnv.c diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 4c100a770a..c503683015 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -1,3 +1,31 @@ +############################################################################### +# Copyright (c) 2025, The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +############################################################################### + ## ## Author: Hank Anderson ## Description: Ported from OpenBLAS/Makefile.prebuild @@ -71,7 +99,6 @@ if (${COMPILER_ID} STREQUAL "GNU") endif () string(TOUPPER ${ARCH} UC_ARCH) - file(WRITE ${TARGET_CONF_TEMP} "#define OS_${HOST_OS}\t1\n" "#define ARCH_${UC_ARCH}\t1\n" @@ -83,6 +110,10 @@ if (${HOST_OS} STREQUAL "WINDOWSSTORE") file(APPEND ${TARGET_CONF_TEMP} "#define OS_WINNT\t1\n") endif () +if (${HOST_OS} STREQUAL CYGWIN) + file(APPEND ${TARGET_CONF_TEMP} + "#define OS_CYGWIN_NT\t1\n") +endif () # f_check if (NOT NOFORTRAN) @@ -131,6 +162,8 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(HAVE_SSE2 1) set(HAVE_SSE3 1) set(HAVE_SSSE3 1) + set(BGEMM_UNROLL_M 8) + set(BGEMM_UNROLL_N 4) set(SBGEMM_UNROLL_M 8) set(SBGEMM_UNROLL_N 4) set(SGEMM_UNROLL_M 8) @@ -1195,7 +1228,34 @@ endif () set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) - elseif ("${TCORE}" STREQUAL "VORTEX") + elseif ("${TCORE}" STREQUAL "AMPEREONE") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_CODE_SIZE\t16384\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t4\n" + "#define L1_DATA_SIZE\t65536\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L1_DATA_ASSOCIATIVE\t4\n" + "#define L2_SIZE\t2097152\n\n" + "#define L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define DTB_SIZE\t4096\n" + "#define HAVE_VFPV4\n" + "#define HAVE_VFPV3\n" + "#define HAVE_VFP\n" + "#define HAVE_NEON\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "VORTEX" OR "${TCORE}" STREQUAL "VORTEXM4") file(APPEND ${TARGET_CONF_TEMP} "#define ARMV8\n" "#define L1_CODE_SIZE\t32768\n" @@ -1253,7 +1313,7 @@ endif () file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE\t32768\n" "#define L1_DATA_LINESIZE\t64\n" - "#define L2_SIZE\t262144\n" + "#define L2_SIZE\t1048576\n" "#define L2_LINESIZE\t64\n" "#define DTB_DEFAULT_ENTRIES\t64\n" "#define DTB_SIZE\t4096\n" @@ -1350,6 +1410,78 @@ endif () set(ZGEMM_UNROLL_M 8) set(ZGEMM_UNROLL_N 2) set(SYMV_P 8) + elseif ("${TCORE}" STREQUAL "C910V") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_DATA_SIZE 32768\n" + "#define L1_DATA_LINESIZE 32\n" + "#define L2_SIZE 1048576\n" + "#define L2_LINESIZE 32 \n" + "#define DTB_DEFAULT_ENTRIES 128\n" + "#define DTB_SIZE 4096\n" + "#define L2_ASSOCIATIVE 4\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 2) + set(CGEMM_UNROLL_N 2) + set(ZGEMM_UNROLL_M 2) + set(ZGEMM_UNROLL_N 2) + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "x280") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_DATA_SIZE 65536\n" + "#define L1_LINESIZE 32 \n" + "#define L2_SIZE 2097152\n" + "#define L2_LINESIZE 32 \n" + "#define DTB_DEFAULT_ENTRIES 128\n" + "#define DTB_SIZE 4096\n" + "#define L2_ASSOCIATIVE 4\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 8) + set(DGEMM_UNROLL_M 16) + set(DGEMM_UNROLL_N 8) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 8) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "RISCV64_ZVL128B") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_DATA_SIZE 32768\n" + "#define L1_DATA_LINESIZE 32\n" + "#define L2_SIZE 1048576\n" + "#define L2_LINESIZE 32 \n" + "#define DTB_DEFAULT_ENTRIES 128\n" + "#define DTB_SIZE 4096\n" + "#define L2_ASSOCIATIVE 4\n") + set(SGEMM_UNROLL_M 8) + set(SGEMM_UNROLL_N 8) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "RISCV64_ZVL256B") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_DATA_SIZE 65536\n" + "#define L1_DATA_LINESIZE 32\n" + "#define L2_SIZE 2097152\n" + "#define L2_LINESIZE 32 \n" + "#define DTB_DEFAULT_ENTRIES 128\n" + "#define DTB_SIZE 4096\n" + "#define L2_ASSOCIATIVE 4\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 8) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 8) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 8) + set(ZGEMM_UNROLL_M 8) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) elseif ("${TCORE}" STREQUAL "GENERIC") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE 32768\n" @@ -1361,7 +1493,7 @@ endif () "#define L2_ASSOCIATIVE 8\n") elseif ("${TCORE}" STREQUAL "RISCV64_GENERIC") file(APPEND ${TARGET_CONF_TEMP} - "#define L1_DATA_SIZE 32768\n" + "#define L1_DATA_SIZE 32768\n" "#define L1_DATA_LINESIZE 32\n" "#define L2_SIZE 1048576\n" "#define L2_LINESIZE 32 \n" @@ -1507,6 +1639,8 @@ else(NOT CMAKE_CROSSCOMPILING) unset (HAVE_VFP) unset (HAVE_VFPV3) unset (HAVE_VFPV4) + unset (HAVE_SVE) + unset (HAVE_SME) message(STATUS "Running getarch") # use the cmake binary w/ the -E param to run a shell command in a cross-platform way diff --git a/cmake/system.cmake b/cmake/system.cmake index 14b2c65b11..f16be31df5 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -1,3 +1,30 @@ +############################################################################### +# Copyright (c) 2025, The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +############################################################################### ## ## Author: Hank Anderson ## Description: Ported from OpenBLAS/Makefile.system @@ -289,12 +316,19 @@ if (DEFINED TARGET) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") endif() +if (${TARGET} STREQUAL Z13) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=z13 -mzvector") +endif() +if (${TARGET} STREQUAL Z14) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=z14 -mzvector") +endif() + if (${TARGET} STREQUAL NEOVERSEV1) if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.4-a+sve+bf16 -mtune=neoverse-v1") else () if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 10.4 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 10.4) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve -mtune=neoverse-v1") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve+bf16 -mtune=neoverse-v1") else () message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${CMAKE_C_COMPILER_VERSION} does not support Neoverse V1.") endif() @@ -311,6 +345,19 @@ if (${TARGET} STREQUAL NEOVERSEV1) endif() endif() endif() + if (${TARGET} STREQUAL NEOVERSEV2) + if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv9-a+sve+sve2+bf16 -mtune=neoverse-v2") + else () + if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 13.0 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 13.0) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=neoverse-v2") + elseif (CMAKE_C_COMPILER_VERSION VERSION_GREATER 10.4 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 10.4) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve+bf16 -mtune=neoverse-v1") + else () + message(FATAL_ERROR "Compiler $${CMAKE_C_COMPILER} ${CMAKE_C_COMPILER_VERSION} does not support Neoverse V2.") + endif() + endif() + endif() if (${TARGET} STREQUAL ARMV8SVE) if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve") @@ -320,6 +367,15 @@ if (${TARGET} STREQUAL NEOVERSEV1) endif() if (${TARGET} STREQUAL ARMV9SME) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv9-a+sme -O3") + if (${CMAKE_SYSTEM_NAME} STREQUAL Windows AND ${CMAKE_C_COMPILER_ID} MATCHES "Clang") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} --aarch64-stack-hazard-size=0") + endif() + endif() + if (${TARGET} STREQUAL VORTEXM4) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sme -O3") + if (${CMAKE_SYSTEM_NAME} STREQUAL Windows AND ${CMAKE_C_COMPILER_ID} MATCHES "Clang") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} --aarch64-stack-hazard-size=0") + endif() endif() if (${TARGET} STREQUAL A64FX) if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) @@ -334,6 +390,28 @@ if (${TARGET} STREQUAL NEOVERSEV1) endif() endif() + if ((${TARGET} STREQUAL RISCV64_ZVL128B) OR (${TARGET} STREQUAL RISCV64_ZVL256B)) + set (RISCV64_OPT "rv64imafdcv") + if (BUILD_BFLOAT16) + set (RISCV64_OPT "${RISCV64_OPT}_zvfbfwma") + endif() + if (BUILD_HFLOAT16) + set (RISCV64_OPT "${RISCV64_OPT}_zvfh_zfh") + endif() + if (${TARGET} STREQUAL RISCV64_ZVL256B) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=${RISCV64_OPT}_zvl256b -mabi=lp64d") + endif() + if (${TARGET} STREQUAL RISCV64_ZVL128B) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=${RISCV64_OPT}_zvl128b -mabi=lp64d") + endif() + endif() + if (${TARGET} STREQUAL RISCV64_GENERIC) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=rv64imafdc -mabi=lp64d") + endif() + if (${TARGET} STREQUAL x280) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d") + endif() + endif() if (DEFINED BINARY) @@ -378,10 +456,16 @@ if (USE_OPENMP) if (NOT NOFORTRAN) find_package(OpenMP COMPONENTS Fortran REQUIRED) # Avoid mixed OpenMP linkage - get_target_property(OMP_C_LIB OpenMP::OpenMP_C INTERFACE_LINK_LIBRARIES) - get_target_property(OMP_Fortran_LIB OpenMP::OpenMP_Fortran INTERFACE_LINK_LIBRARIES) - if (NOT OMP_C_LIB STREQUAL OMP_Fortran_LIB) - message(FATAL_ERROR "Multiple OpenMP runtime libraries detected. Mixed OpenMP runtime linkage is dangerous. You may pass -DOpenMP_LANG_LIB_NAMES and -DOpenMP_omp_LIBRARY to manually choose the OpenMP library.") + get_target_property(OMP_C_LIBS OpenMP::OpenMP_C INTERFACE_LINK_LIBRARIES) + get_target_property(OMP_F_LIBS OpenMP::OpenMP_Fortran INTERFACE_LINK_LIBRARIES) + if (NOT OMP_C_LIBS STREQUAL OMP_F_LIBS) + message(NOTICE + "CMake detected different OpenMP libraries for C and Fortran:\n" + "C=${OMP_C_LIBS}\n" + "Fortran=${OMP_F_LIBS}\n" + "In case you encounter issues, please check that this is correct.\n" + "You may pass -DOpenMP__LIB_NAMES and -DOpenMP__LIBRARY to cmake to manually force the OpenMP library." + ) endif() endif () endif () @@ -420,20 +504,32 @@ if (X86_64 OR ${CORE} STREQUAL POWER10 OR ARM64 OR LOONGARCH64) endif () if (ARM64) set(GEMM_GEMV_FORWARD TRUE) + set(SBGEMM_GEMV_FORWARD TRUE) + set(BGEMM_GEMV_FORWARD TRUE) +endif () +if (POWER) + set(GEMM_GEMV_FORWARD TRUE) + set(SBGEMM_GEMV_FORWARD TRUE) +endif () +if (RISCV64) + set(GEMM_GEMV_FORWARD TRUE) endif () -if (GEMM_GEMV_FORWARD AND NOT ONLY_CBLAS) +if (GEMM_GEMV_FORWARD) set(CCOMMON_OPT "${CCOMMON_OPT} -DGEMM_GEMV_FORWARD") endif () -if (GEMM_GEMV_FORWARD_BF16 AND NOT ONLY_CBLAS) - set(CCOMMON_OPT "${CCOMMON_OPT} -DGEMM_GEMV_FORWARD_BF16") +if (SBGEMM_GEMV_FORWARD) + set(CCOMMON_OPT "${CCOMMON_OPT} -DSBGEMM_GEMV_FORWARD") +endif () +if (BGEMM_GEMV_FORWARD) + set(CCOMMON_OPT "${CCOMMON_OPT} -DBGEMM_GEMV_FORWARD") endif () if (SMALL_MATRIX_OPT) set(CCOMMON_OPT "${CCOMMON_OPT} -DSMALL_MATRIX_OPT") endif () if (DYNAMIC_ARCH) - if (X86 OR X86_64 OR ARM64 OR POWER OR RISCV64 OR LOONGARCH64) + if (X86 OR X86_64 OR ARM64 OR POWER OR RISCV64 OR LOONGARCH64 OR ZARCH) set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") if (DYNAMIC_OLDER) set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") @@ -640,6 +736,9 @@ endif() if (BUILD_BFLOAT16) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_BFLOAT16") endif() +if (BUILD_HFLOAT16) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_HFLOAT16") +endif() if(NOT MSVC) set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}") endif() @@ -647,14 +746,14 @@ endif() set(PFLAGS "${PFLAGS} ${CCOMMON_OPT} -I${TOPDIR} -DPROFILE ${COMMON_PROF}") if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release") -if ("${F_COMPILER}" STREQUAL "FLANG") -if (${CMAKE_Fortran_COMPILER_VERSION} VERSION_LESS_EQUAL 3) - set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -fno-unroll-loops") -endif () -endif () -if (ARM64 AND CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*" AND CMAKE_SYSTEM_NAME STREQUAL "Windows") - set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -O2") -endif () + if ("${F_COMPILER}" STREQUAL "FLANG") + if (${CMAKE_Fortran_COMPILER_VERSION} VERSION_LESS_EQUAL 3) + set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -fno-unroll-loops") + endif () + endif () + if (ARM64 AND CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*" AND CMAKE_SYSTEM_NAME STREQUAL "Windows") + set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -O2") + endif () endif () diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index dd0dfab637..3d987c3062 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -50,6 +50,9 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*") set(LOONGARCH64 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64.*") set(RISCV64 1) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "s390.*") + set(ZARCH 1) + set(BINARY 64) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*" OR (CMAKE_SYSTEM_NAME MATCHES "Darwin" AND CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*")) if (NOT BINARY) if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") @@ -100,6 +103,8 @@ elseif(X86) set(ARCH "x86") elseif(POWER) set(ARCH "power") +elseif(ZARCH) + set(ARCH "zarch") elseif(MIPS32) set(ARCH "mips") elseif(MIPS64) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index a93f21686f..e717233c1f 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -1,3 +1,31 @@ +############################################################################### +# Copyright (c) 2025, The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +############################################################################### + # Functions to help with the OpenBLAS build # Reads string from getarch into CMake vars. Format of getarch vars is VARNAME=VALUE @@ -347,9 +375,12 @@ function(GenerateNamedObjects sources_in) if (NOT no_float_type) string(SUBSTRING ${float_type} 0 1 float_char) string(TOLOWER ${float_char} float_char) - if (${float_type} STREQUAL "BFLOAT16") - set (float_char "sb") - endif () + if (${float_type} STREQUAL "BFLOAT16" AND NOT "${defines_in}" MATCHES "BGEM") + set (float_char "sb") + endif () + if (${float_type} STREQUAL "HFLOAT16" AND NOT "${defines_in}" MATCHES "HGEM") + set (float_char "sh") + endif () endif () if (NOT name_in) @@ -399,6 +430,9 @@ function(GenerateNamedObjects sources_in) endif () if (${float_type} STREQUAL "BFLOAT16") list(APPEND obj_defines "BFLOAT16") + endif () + if (${float_type} STREQUAL "HFLOAT16") + list(APPEND obj_defines "HFLOAT16") endif () if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") list(APPEND obj_defines "COMPLEX") diff --git a/common.h b/common.h index 8d002c4aa0..efca63e4d9 100644 --- a/common.h +++ b/common.h @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2025 The OpenBLAS Project. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -266,6 +267,14 @@ typedef uint16_t bfloat16; #define BFLOAT16CONVERSION 1 #endif +#ifdef BUILD_HFLOAT16 + #ifndef hfloat16 + typedef _Float16 hfloat16; + #endif +#else + typedef uint16_t hfloat16; +#endif + #ifdef USE64BITINT typedef BLASLONG blasint; #if defined(OS_WINDOWS) && defined(__64BIT__) @@ -309,10 +318,21 @@ typedef int blasint; #elif defined(BFLOAT16) #define IFLOAT bfloat16 #define XFLOAT IFLOAT -#define FLOAT float +#ifdef BGEMM +#define FLOAT bfloat16 +#else +#define FLOAT float +#endif #define SIZE 2 #define BASE_SHIFT 1 #define ZBASE_SHIFT 2 +#elif defined(HFLOAT16) +#define IFLOAT hfloat16 +#define XFLOAT IFLOAT +#define FLOAT float +#define SIZE 2 +#define BASE_SHIFT 1 +#define ZBASE_SHIFT 2 #else #define FLOAT float #define SIZE 4 @@ -342,18 +362,6 @@ typedef int blasint; #define MAX_CPU_NUMBER 2 #endif -#if defined(OS_SUNOS) -#define YIELDING thr_yield() -#endif - -#if defined(OS_WINDOWS) -#if defined(_MSC_VER) && !defined(__clang__) -#define YIELDING YieldProcessor() -#else -#define YIELDING SwitchToThread() -#endif -#endif - #if defined(ARMV7) || defined(ARMV6) || defined(ARMV8) || defined(ARMV5) #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop; \n"); #endif @@ -378,14 +386,26 @@ typedef int blasint; #endif #endif - #ifdef __EMSCRIPTEN__ #define YIELDING #endif +#if defined(_MSC_VER) && !defined(__clang__) +#undef YIELDING // MSVC doesn't support assembly code +#define YIELDING YieldProcessor() +#endif + #ifndef YIELDING +#if defined(OS_SUNOS) +#define YIELDING thr_yield() + +#elif defined(OS_WINDOWS) +#define YIELDING SwitchToThread() + +#else // assume POSIX.1-2008 #define YIELDING sched_yield() #endif +#endif /*** To alloc job_t on heap or stack. @@ -745,7 +765,7 @@ static __inline int readenv_atoi(char *env) { return 0; } #else -#ifdef OS_WINDOWS +#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT) static __inline int readenv_atoi(char *env) { env_var_t p; return readenv(p,env) ? 0 : atoi(p); @@ -761,7 +781,7 @@ static __inline int readenv_atoi(char *env) { #endif #endif -#if !defined(XDOUBLE) || !defined(QUAD_PRECISION) +#if !defined(BFLOAT16) && (!defined(XDOUBLE) || !defined(QUAD_PRECISION)) static __inline void compinv(FLOAT *b, FLOAT ar, FLOAT ai){ diff --git a/common_arm64.h b/common_arm64.h index 5856898a2b..2002de90fd 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -114,9 +114,9 @@ static __inline BLASULONG rpcc(void){ #else BLASULONG ret = 0; blasint shift; - - __asm__ __volatile__ ("isb; mrs %0,cntvct_el0":"=r"(ret)); - __asm__ __volatile__ ("mrs %0,cntfrq_el0; clz %w0, %w0":"=&r"(shift)); + + __asm__ __volatile__ ("isb\n\tmrs %0,cntvct_el0":"=r"(ret)); + __asm__ __volatile__ ("mrs %x0,cntfrq_el0\n\tclz %w0, %w0":"=&r"(shift)); return ret << shift; #endif diff --git a/common_b.h b/common_b.h new file mode 100644 index 0000000000..1921c3a69d --- /dev/null +++ b/common_b.h @@ -0,0 +1,100 @@ +/*************************************************************************** + * Copyright (c) 2025, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#ifndef COMMON_B_H +#define COMMON_B_H + +#ifndef DYNAMIC_ARCH +#define BGEMV_N_K bgemv_n +#define BGEMV_T_K bgemv_t + +#define BSCAL_K bscal_k + +#define BGEMM_ONCOPY bgemm_oncopy +#define BGEMM_OTCOPY bgemm_otcopy + +#if BGEMM_DEFAULT_UNROLL_M == BGEMM_DEFAULT_UNROLL_N +#define BGEMM_INCOPY bgemm_oncopy +#define BGEMM_ITCOPY bgemm_otcopy +#else +#define BGEMM_INCOPY bgemm_incopy +#define BGEMM_ITCOPY bgemm_itcopy +#endif + +#define BGEMM_BETA bgemm_beta +#define BGEMM_KERNEL bgemm_kernel + +#else +#define BGEMV_N_K gotoblas->bgemv_n +#define BGEMV_T_K gotoblas->bgemv_t + +#define BSCAL_K gotoblas->bscal_k + +#define BGEMM_ONCOPY gotoblas->bgemm_oncopy +#define BGEMM_OTCOPY gotoblas->bgemm_otcopy +#define BGEMM_INCOPY gotoblas->bgemm_incopy +#define BGEMM_ITCOPY gotoblas->bgemm_itcopy +#define BGEMM_BETA gotoblas->bgemm_beta +#define BGEMM_KERNEL gotoblas->bgemm_kernel + +#endif + +#define BGEMM_NN bgemm_nn +#define BGEMM_CN bgemm_tn +#define BGEMM_TN bgemm_tn +#define BGEMM_NC bgemm_nt +#define BGEMM_NT bgemm_nt +#define BGEMM_CC bgemm_tt +#define BGEMM_CT bgemm_tt +#define BGEMM_TC bgemm_tt +#define BGEMM_TT bgemm_tt +#define BGEMM_NR bgemm_nn +#define BGEMM_TR bgemm_tn +#define BGEMM_CR bgemm_tn +#define BGEMM_RN bgemm_nn +#define BGEMM_RT bgemm_nt +#define BGEMM_RC bgemm_nt +#define BGEMM_RR bgemm_nn + +#define BGEMM_THREAD_NN bgemm_thread_nn +#define BGEMM_THREAD_CN bgemm_thread_tn +#define BGEMM_THREAD_TN bgemm_thread_tn +#define BGEMM_THREAD_NC bgemm_thread_nt +#define BGEMM_THREAD_NT bgemm_thread_nt +#define BGEMM_THREAD_CC bgemm_thread_tt +#define BGEMM_THREAD_CT bgemm_thread_tt +#define BGEMM_THREAD_TC bgemm_thread_tt +#define BGEMM_THREAD_TT bgemm_thread_tt +#define BGEMM_THREAD_NR bgemm_thread_nn +#define BGEMM_THREAD_TR bgemm_thread_tn +#define BGEMM_THREAD_CR bgemm_thread_tn +#define BGEMM_THREAD_RN bgemm_thread_nn +#define BGEMM_THREAD_RT bgemm_thread_nt +#define BGEMM_THREAD_RC bgemm_thread_nt +#define BGEMM_THREAD_RR bgemm_thread_nn +#endif diff --git a/common_interface.h b/common_interface.h index efd3c6649d..380ce8d081 100644 --- a/common_interface.h +++ b/common_interface.h @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2025 The OpenBLAS Project. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -59,6 +60,7 @@ double BLASFUNC(dsdot) (blasint *, float *, blasint *, float *, blasint *); double BLASFUNC(ddot) (blasint *, double *, blasint *, double *, blasint *); xdouble BLASFUNC(qdot) (blasint *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(bscal) (blasint *, bfloat16 *, bfloat16 *, blasint *); float BLASFUNC(sbdot) (blasint *, bfloat16 *, blasint *, bfloat16 *, blasint *); void BLASFUNC(sbstobf16) (blasint *, float *, blasint *, bfloat16 *, blasint *); void BLASFUNC(sbdtobf16) (blasint *, double *, blasint *, bfloat16 *, blasint *); @@ -255,8 +257,12 @@ void BLASFUNC(xgeru)(blasint *, blasint *, xdouble *, xdouble *, blasint *, void BLASFUNC(xgerc)(blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(bgemv)(char *, blasint *, blasint *, bfloat16 *, bfloat16 *, blasint *, + bfloat16 *, blasint *, bfloat16 *, bfloat16 *, blasint *); void BLASFUNC(sbgemv)(char *, blasint *, blasint *, float *, bfloat16 *, blasint *, bfloat16 *, blasint *, float *, float *, blasint *); +void BLASFUNC(shgemv)(char *, blasint *, blasint *, float *, hfloat16 *, blasint *, + hfloat16 *, blasint *, float *, float *, blasint *); void BLASFUNC(sgemv)(char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(dgemv)(char *, blasint *, blasint *, double *, double *, blasint *, @@ -481,6 +487,10 @@ void BLASFUNC(xhbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint /* Level 3 routines */ +void BLASFUNC(shgemm)(char *, char *, blasint *, blasint *, blasint *, float *, + hfloat16 *, blasint *, hfloat16 *, blasint *, float *, float *, blasint *); +void BLASFUNC(bgemm)(char *, char *, blasint *, blasint *, blasint *, bfloat16 *, + bfloat16 *, blasint *, bfloat16 *, blasint *, bfloat16 *, bfloat16 *, blasint *); void BLASFUNC(sbgemm)(char *, char *, blasint *, blasint *, blasint *, float *, bfloat16 *, blasint *, bfloat16 *, blasint *, float *, float *, blasint *); void BLASFUNC(sgemm)(char *, char *, blasint *, blasint *, blasint *, float *, diff --git a/common_lapack.h b/common_lapack.h index f9c36646ad..2151b88ac0 100644 --- a/common_lapack.h +++ b/common_lapack.h @@ -439,4 +439,9 @@ blasint xtrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdo blasint xtrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint xtrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint slaed3_single(blasint *, blasint *, blasint *, float *, float *, blasint *, float *, float *, float *, blasint *, blasint *, float *, float *, blasint *); +blasint dlaed3_single(blasint *, blasint *, blasint *, double *, double *, blasint *, double *, double *, double *, blasint *, blasint *, double *, double *, blasint *); +blasint slaed3_parallel(blasint *, blasint *, blasint *, float *, float *, blasint *, float *, float *, float *, blasint *, blasint *, float *, float *, blasint *); +blasint dlaed3_parallel(blasint *, blasint *, blasint *, double *, double *, blasint *, double *, double *, double *, blasint *, blasint *, double *, double *, blasint *); + #endif diff --git a/common_level1.h b/common_level1.h index 85b39f7a7c..7ab45a472b 100644 --- a/common_level1.h +++ b/common_level1.h @@ -1,4 +1,5 @@ /*********************************************************************/ +/* Copyright 2025 The OpenBLAS Project. */ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ @@ -169,6 +170,9 @@ BLASLONG icmin_k(BLASLONG, float *, BLASLONG); BLASLONG izmin_k(BLASLONG, double *, BLASLONG); BLASLONG ixmin_k(BLASLONG, xdouble *, BLASLONG); + +int bscal_k(BLASLONG, BLASLONG, BLASLONG, bfloat16, + bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG); int sscal_k(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int dscal_k(BLASLONG, BLASLONG, BLASLONG, double, diff --git a/common_level2.h b/common_level2.h index 9a5ebb4d9b..492787cf70 100644 --- a/common_level2.h +++ b/common_level2.h @@ -1,4 +1,5 @@ /*********************************************************************/ +/* Copyright 2025 The OpenBLAS Project */ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ @@ -44,10 +45,19 @@ extern "C" { #endif + +int bgemv_n(BLASLONG, BLASLONG, bfloat16, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, bfloat16, bfloat16 *, BLASLONG); +int bgemv_t(BLASLONG, BLASLONG, bfloat16, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, bfloat16, bfloat16 *, BLASLONG); +int bgemv_thread_n(BLASLONG, BLASLONG, bfloat16, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, bfloat16, bfloat16 *, BLASLONG, int); +int bgemv_thread_t(BLASLONG, BLASLONG, bfloat16, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, bfloat16, bfloat16 *, BLASLONG, int); int sbgemv_n(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG); int sbgemv_t(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG); int sbgemv_thread_n(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG, int); int sbgemv_thread_t(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG, int); +int shgemv_n(BLASLONG, BLASLONG, float, hfloat16 *, BLASLONG, hfloat16 *, BLASLONG, float, float *, BLASLONG); +int shgemv_t(BLASLONG, BLASLONG, float, hfloat16 *, BLASLONG, hfloat16 *, BLASLONG, float, float *, BLASLONG); +int shgemv_thread_n(BLASLONG, BLASLONG, float, hfloat16 *, BLASLONG, hfloat16 *, BLASLONG, float, float *, BLASLONG, int); +int shgemv_thread_t(BLASLONG, BLASLONG, float, hfloat16 *, BLASLONG, hfloat16 *, BLASLONG, float, float *, BLASLONG, int); int sger_k (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int dger_k (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int qger_k (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); diff --git a/common_level3.h b/common_level3.h index d370c1f96a..64d206a6d1 100644 --- a/common_level3.h +++ b/common_level3.h @@ -52,9 +52,95 @@ void sgemm_direct(BLASLONG M, BLASLONG N, BLASLONG K, float * B, BLASLONG strideB, float * R, BLASLONG strideR); -int sgemm_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K); +void sgemm_direct_alpha_beta(BLASLONG M, BLASLONG N, BLASLONG K, + float alpha, + float * A, BLASLONG strideA, + float * B, BLASLONG strideB, + float beta, + float * R, BLASLONG strideR); + +void ssymm_direct_alpha_betaLU(BLASLONG M, BLASLONG N, + float alpha, + float * A, BLASLONG strideA, + float * B, BLASLONG strideB, + float beta, + float * R, BLASLONG strideR); +void ssymm_direct_alpha_betaLL(BLASLONG M, BLASLONG N, + float alpha, + float * A, BLASLONG strideA, + float * B, BLASLONG strideB, + float beta, + float * R, BLASLONG strideR); + +void strmm_direct_LNUN(BLASLONG M, BLASLONG N, + float alpha, + float * A, BLASLONG strideA, + float * B, BLASLONG strideB); +void strmm_direct_LNLN(BLASLONG M, BLASLONG N, + float alpha, + float * A, BLASLONG strideA, + float * B, BLASLONG strideB); +void strmm_direct_LTUN(BLASLONG M, BLASLONG N, + float alpha, + float * A, BLASLONG strideA, + float * B, BLASLONG strideB); +void strmm_direct_LTLN(BLASLONG M, BLASLONG N, + float alpha, + float * A, BLASLONG strideA, + float * B, BLASLONG strideB); +void ssyrk_direct_alpha_betaUN(BLASLONG N, BLASLONG K, + float alpha, + float * A, BLASLONG strideA, + float beta, + float * C, BLASLONG strideC); +void ssyrk_direct_alpha_betaUT(BLASLONG N, BLASLONG K, + float alpha, + float * A, BLASLONG strideA, + float beta, + float * C, BLASLONG strideC); +void ssyrk_direct_alpha_betaLN(BLASLONG N, BLASLONG K, + float alpha, + float * A, BLASLONG strideA, + float beta, + float * C, BLASLONG strideC); +void ssyrk_direct_alpha_betaLT(BLASLONG N, BLASLONG K, + float alpha, + float * A, BLASLONG strideA, + float beta, + float * C, BLASLONG strideC); +void ssyr2k_direct_alpha_betaUN(BLASLONG N, BLASLONG K, + float alpha, + float * A, BLASLONG strideA, + float * B, BLASLONG strideB, + float beta, + float * R, BLASLONG strideR); +void ssyr2k_direct_alpha_betaUT(BLASLONG N, BLASLONG K, + float alpha, + float * A, BLASLONG strideA, + float * B, BLASLONG strideB, + float beta, + float * R, BLASLONG strideR); +void ssyr2k_direct_alpha_betaLN(BLASLONG N, BLASLONG K, + float alpha, + float * A, BLASLONG strideA, + float * B, BLASLONG strideB, + float beta, + float * R, BLASLONG strideR); +void ssyr2k_direct_alpha_betaLT(BLASLONG N, BLASLONG K, + float alpha, + float * A, BLASLONG strideA, + float * B, BLASLONG strideB, + float beta, + float * R, BLASLONG strideR); + +int sgemm_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K); + +int shgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, + hfloat16 *, BLASLONG, hfloat16 *, BLASLONG, float *, BLASLONG); +int bgemm_beta(BLASLONG, BLASLONG, BLASLONG, bfloat16, + bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG); int sbgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, @@ -78,6 +164,14 @@ int xgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); #endif +int shgemm_incopy(BLASLONG m, BLASLONG n, hfloat16 *a, BLASLONG lda, hfloat16 *b); +int shgemm_itcopy(BLASLONG m, BLASLONG n, hfloat16 *a, BLASLONG lda, hfloat16 *b); +int shgemm_oncopy(BLASLONG m, BLASLONG n, hfloat16 *a, BLASLONG lda, hfloat16 *b); +int shgemm_otcopy(BLASLONG m, BLASLONG n, hfloat16 *a, BLASLONG lda, hfloat16 *b); +int bgemm_incopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); +int bgemm_itcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); +int bgemm_oncopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); +int bgemm_otcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); int sbgemm_incopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); int sbgemm_itcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); int sbgemm_oncopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); @@ -505,6 +599,8 @@ int xher2k_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdoubl int xher2k_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); int xher2k_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); +int shgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, hfloat16 *, hfloat16 *, float *, BLASLONG); +int bgemm_kernel(BLASLONG, BLASLONG, BLASLONG, bfloat16, bfloat16 *, bfloat16 *, bfloat16 *, BLASLONG); int sbgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); int sgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); int dgemm_kernel(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); @@ -657,6 +753,16 @@ int cgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float int zgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); int xgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); +int shgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, hfloat16 *, hfloat16 *, BLASLONG); +int shgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, hfloat16 *, hfloat16 *, BLASLONG); +int shgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, hfloat16 *, hfloat16 *, BLASLONG); +int shgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, hfloat16 *, hfloat16 *, BLASLONG); + +int bgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int bgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int bgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int bgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); + int sbgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); int sbgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); int sbgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); @@ -754,6 +860,16 @@ int xgemm_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLON int xgemm_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); #endif +int shgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, hfloat16 *, hfloat16 *, BLASLONG); +int shgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, hfloat16 *, hfloat16 *, BLASLONG); +int shgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, hfloat16 *, hfloat16 *, BLASLONG); +int shgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, hfloat16 *, hfloat16 *, BLASLONG); + +int bgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int bgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int bgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int bgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); + int sbgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); int sbgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); int sbgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); @@ -1944,6 +2060,7 @@ int dgemm_batch_thread(blas_arg_t * queue, BLASLONG nums); int cgemm_batch_thread(blas_arg_t * queue, BLASLONG nums); int zgemm_batch_thread(blas_arg_t * queue, BLASLONG nums); int sbgemm_batch_thread(blas_arg_t * queue, BLASLONG nums); +// int shgemm_batch_thread(blas_arg_t * queue, BLASLONG nums); #ifdef __CUDACC__ } diff --git a/common_macro.h b/common_macro.h index 820cb472a6..745643fa89 100644 --- a/common_macro.h +++ b/common_macro.h @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2025 The OpenBLAS Project. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -39,6 +40,8 @@ #ifndef COMMON_MACRO #define COMMON_MACRO +#include "common_sh.h" +#include "common_b.h" #include "common_sb.h" #include "common_s.h" #include "common_d.h" @@ -656,15 +659,111 @@ #define GEMM_SMALL_KERNEL_B0_NT DGEMM_SMALL_KERNEL_B0_NT #define GEMM_SMALL_KERNEL_B0_TN DGEMM_SMALL_KERNEL_B0_TN #define GEMM_SMALL_KERNEL_B0_TT DGEMM_SMALL_KERNEL_B0_TT +#elif defined(HFLOAT16) +#define GEMM_BETA SHGEMM_BETA +#define GEMM_KERNEL_N SHGEMM_KERNEL +#define GEMM_KERNEL_L SHGEMM_KERNEL +#define GEMM_KERNEL_R SHGEMM_KERNEL +#define GEMM_KERNEL_B SHGEMM_KERNEL +#define GEMM_NN SHGEMM_NN +#define GEMM_CN SHGEMM_TN +#define GEMM_TN SHGEMM_TN +#define GEMM_NC SHGEMM_NT +#define GEMM_NT SHGEMM_NT +#define GEMM_CC SHGEMM_TT +#define GEMM_CT SHGEMM_TT +#define GEMM_TC SHGEMM_TT +#define GEMM_TT SHGEMM_TT +#define GEMM_NR SHGEMM_NN +#define GEMM_TR SHGEMM_TN +#define GEMM_CR SHGEMM_TN +#define GEMM_RN SHGEMM_NN +#define GEMM_RT SHGEMM_NT +#define GEMM_RC SHGEMM_NT +#define GEMM_RR SHGEMM_NN +#define GEMM_ONCOPY SHGEMM_ONCOPY +#define GEMM_OTCOPY SHGEMM_OTCOPY +#define GEMM_INCOPY SHGEMM_INCOPY +#define GEMM_ITCOPY SHGEMM_ITCOPY + +#define GEMM_THREAD_NN SHGEMM_THREAD_NN +#define GEMM_THREAD_CN SHGEMM_THREAD_TN +#define GEMM_THREAD_TN SHGEMM_THREAD_TN +#define GEMM_THREAD_NC SHGEMM_THREAD_NT +#define GEMM_THREAD_NT SHGEMM_THREAD_NT +#define GEMM_THREAD_CC SHGEMM_THREAD_TT +#define GEMM_THREAD_CT SHGEMM_THREAD_TT +#define GEMM_THREAD_TC SHGEMM_THREAD_TT +#define GEMM_THREAD_TT SHGEMM_THREAD_TT +#define GEMM_THREAD_NR SHGEMM_THREAD_NN +#define GEMM_THREAD_TR SHGEMM_THREAD_TN +#define GEMM_THREAD_CR SHGEMM_THREAD_TN +#define GEMM_THREAD_RN SHGEMM_THREAD_NN +#define GEMM_THREAD_RT SHGEMM_THREAD_NT +#define GEMM_THREAD_RC SHGEMM_THREAD_NT +#define GEMM_THREAD_RR SHGEMM_THREAD_NN + +#define SCAL_K SSCAL_K +#define GEMV_N SHGEMV_N_K +#define GEMV_T SHGEMV_T_K + +#elif defined(BFLOAT16) && defined(BGEMM) +#define SCAL_K BSCAL_K + +#define GEMV_N BGEMV_N_K +#define GEMV_T BGEMV_T_K + +#define GEMM_BETA BGEMM_BETA +#define GEMM_KERNEL_N BGEMM_KERNEL +#define GEMM_KERNEL_L BGEMM_KERNEL +#define GEMM_KERNEL_R BGEMM_KERNEL +#define GEMM_KERNEL_B BGEMM_KERNEL + +#define GEMM_NN BGEMM_NN +#define GEMM_CN BGEMM_TN +#define GEMM_TN BGEMM_TN +#define GEMM_NC BGEMM_NT +#define GEMM_NT BGEMM_NT +#define GEMM_CC BGEMM_TT +#define GEMM_CT BGEMM_TT +#define GEMM_TC BGEMM_TT +#define GEMM_TT BGEMM_TT +#define GEMM_NR BGEMM_NN +#define GEMM_TR BGEMM_TN +#define GEMM_CR BGEMM_TN +#define GEMM_RN BGEMM_NN +#define GEMM_RT BGEMM_NT +#define GEMM_RC BGEMM_NT +#define GEMM_RR BGEMM_NN +#define GEMM_ONCOPY BGEMM_ONCOPY +#define GEMM_OTCOPY BGEMM_OTCOPY +#define GEMM_INCOPY BGEMM_INCOPY +#define GEMM_ITCOPY BGEMM_ITCOPY + +#define GEMM_THREAD_NN BGEMM_THREAD_NN +#define GEMM_THREAD_CN BGEMM_THREAD_TN +#define GEMM_THREAD_TN BGEMM_THREAD_TN +#define GEMM_THREAD_NC BGEMM_THREAD_NT +#define GEMM_THREAD_NT BGEMM_THREAD_NT +#define GEMM_THREAD_CC BGEMM_THREAD_TT +#define GEMM_THREAD_CT BGEMM_THREAD_TT +#define GEMM_THREAD_TC BGEMM_THREAD_TT +#define GEMM_THREAD_TT BGEMM_THREAD_TT +#define GEMM_THREAD_NR BGEMM_THREAD_NN +#define GEMM_THREAD_TR BGEMM_THREAD_TN +#define GEMM_THREAD_CR BGEMM_THREAD_TN +#define GEMM_THREAD_RN BGEMM_THREAD_NN +#define GEMM_THREAD_RT BGEMM_THREAD_NT +#define GEMM_THREAD_RC BGEMM_THREAD_NT +#define GEMM_THREAD_RR BGEMM_THREAD_NN #elif defined(BFLOAT16) - #define D_TO_BF16_K SBDTOBF16_K #define D_BF16_TO_K DBF16TOD_K #define S_TO_BF16_K SBSTOBF16_K #define S_BF16_TO_K SBF16TOS_K -#define SBGEMV_N SBGEMV_N_K -#define SBGEMV_T SBGEMV_T_K +#define GEMV_N SBGEMV_N_K +#define GEMV_T SBGEMV_T_K #define AMAX_K SAMAX_K #define AMIN_K SAMIN_K @@ -682,8 +781,6 @@ #define AXPYC_K SAXPYC_K #define AXPBY_K SAXPBY_K #define SCAL_K SSCAL_K -#define GEMV_N SGEMV_N -#define GEMV_T SGEMV_T #define SYMV_U SSYMV_U #define SYMV_L SSYMV_L #define GERU_K SGERU_K @@ -2618,6 +2715,9 @@ || defined(ARCH_LOONGARCH64) || defined(ARCH_E2K) || defined(ARCH_ALPHA)) extern BLASLONG gemm_offset_a; extern BLASLONG gemm_offset_b; +extern BLASLONG bgemm_p; +extern BLASLONG bgemm_q; +extern BLASLONG bgemm_r; extern BLASLONG sbgemm_p; extern BLASLONG sbgemm_q; extern BLASLONG sbgemm_r; @@ -2990,6 +3090,8 @@ typedef struct { #define NEG_TCOPY DNEG_TCOPY #define LARF_L DLARF_L #define LARF_R DLARF_R +#define LAED3_SINGLE dlaed3_single +#define LAED3_PARALLEL dlaed3_parallel #else #define GETF2 SGETF2 #define GETRF SGETRF @@ -3011,6 +3113,8 @@ typedef struct { #define NEG_TCOPY SNEG_TCOPY #define LARF_L SLARF_L #define LARF_R SLARF_R +#define LAED3_SINGLE slaed3_single +#define LAED3_PARALLEL slaed3_parallel #endif #else #ifdef XDOUBLE diff --git a/common_param.h b/common_param.h index 2d771a27da..9e5edbb816 100644 --- a/common_param.h +++ b/common_param.h @@ -1,6 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ -/* Copyright 2023 The OpenBLAS Project. */ +/* Copyright 2023, 2025 The OpenBLAS Project. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -48,8 +48,28 @@ typedef struct { int dtb_entries; int switch_ratio; int offsetA, offsetB, align; +#if BUILD_HFLOAT16 == 1 +int shgemm_p, shgemm_q, shgemm_r; +int shgemm_unroll_m, shgemm_unroll_n, shgemm_unroll_mn; + +int (*shgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, hfloat16 *, hfloat16 *, float *, BLASLONG); +int (*shgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, hfloat16 *, BLASLONG, hfloat16 *, BLASLONG, float *, BLASLONG); + +int (*shgemm_incopy )(BLASLONG, BLASLONG, hfloat16 *, BLASLONG, hfloat16 *); +int (*shgemm_itcopy )(BLASLONG, BLASLONG, hfloat16 *, BLASLONG, hfloat16 *); +int (*shgemm_oncopy )(BLASLONG, BLASLONG, hfloat16 *, BLASLONG, hfloat16 *); +int (*shgemm_otcopy )(BLASLONG, BLASLONG, hfloat16 *, BLASLONG, hfloat16 *); + +int (*shgemv_n) (BLASLONG, BLASLONG, float, hfloat16 *, BLASLONG, hfloat16 *, BLASLONG, float, float *, BLASLONG); +int (*shgemv_t) (BLASLONG, BLASLONG, float, hfloat16 *, BLASLONG, hfloat16 *, BLASLONG, float, float *, BLASLONG); +#endif + #if BUILD_BFLOAT16 == 1 + int bgemm_p, bgemm_q, bgemm_r; + int bgemm_unroll_m, bgemm_unroll_n, bgemm_unroll_mn; + int bgemm_align_k; + int sbgemm_p, sbgemm_q, sbgemm_r; int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn; int sbgemm_align_k; @@ -64,10 +84,10 @@ typedef struct { float (*sbamin_k) (BLASLONG, float *, BLASLONG); float (*sbmax_k) (BLASLONG, float *, BLASLONG); float (*sbmin_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*isbamax_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*isbamin_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*isbmax_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG); + BLASLONG (*isbamax_k)(BLASLONG, float *, BLASLONG); + BLASLONG (*isbamin_k)(BLASLONG, float *, BLASLONG); + BLASLONG (*isbmax_k) (BLASLONG, float *, BLASLONG); + BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG); float (*sbnrm2_k) (BLASLONG, float *, BLASLONG); float (*sbasum_k) (BLASLONG, float *, BLASLONG); @@ -79,10 +99,14 @@ BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG); int (*sbrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); int (*sbrotm_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*bscal_k) (BLASLONG, BLASLONG, BLASLONG, bfloat16, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG); int (*sbaxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*sbscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*sbswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*bgemv_n) (BLASLONG, BLASLONG, bfloat16, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, bfloat16, bfloat16 *, BLASLONG); + int (*bgemv_t) (BLASLONG, BLASLONG, bfloat16, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, bfloat16, bfloat16 *, BLASLONG); + int (*sbgemv_n) (BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG); int (*sbgemv_t) (BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG); int (*sbger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); @@ -90,6 +114,14 @@ BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG); int (*sbsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sbsymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*bgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, bfloat16, bfloat16 *, bfloat16 *, bfloat16 *, BLASLONG); + int (*bgemm_beta )(BLASLONG, BLASLONG, BLASLONG, bfloat16, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG); + + int (*bgemm_incopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*bgemm_itcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*bgemm_oncopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*bgemm_otcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*sbgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); int (*sbgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); @@ -180,12 +212,12 @@ BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG); #endif #if (BUILD_SINGLE==1) || (BUILD_DOUBLE ==1) || (BUILD_COMPLEX==1) -BLASLONG (*isamax_k)(BLASLONG, float *, BLASLONG); + BLASLONG (*isamax_k)(BLASLONG, float *, BLASLONG); #endif #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) -BLASLONG (*isamin_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*ismax_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); + BLASLONG (*isamin_k)(BLASLONG, float *, BLASLONG); + BLASLONG (*ismax_k) (BLASLONG, float *, BLASLONG); + BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); float (*snrm2_k) (BLASLONG, float *, BLASLONG); float (*sasum_k) (BLASLONG, float *, BLASLONG); #endif @@ -225,6 +257,22 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); #endif #ifdef ARCH_ARM64 void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); + int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K); + void (*sgemm_direct_alpha_beta) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float, float * , BLASLONG); + void (*ssymm_direct_alpha_betaLU) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float, float * , BLASLONG); + void (*ssymm_direct_alpha_betaLL) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float, float * , BLASLONG); + void (*strmm_direct_LNUN) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG); + void (*strmm_direct_LNLN) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG); + void (*strmm_direct_LTUN) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG); + void (*strmm_direct_LTLN) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG); + void (*ssyrk_direct_alpha_betaUN) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); + void (*ssyrk_direct_alpha_betaUT) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); + void (*ssyrk_direct_alpha_betaLN) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); + void (*ssyrk_direct_alpha_betaLT) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); + void (*ssyr2k_direct_alpha_betaUN) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float, float *, BLASLONG); + void (*ssyr2k_direct_alpha_betaUT) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float, float *, BLASLONG); + void (*ssyr2k_direct_alpha_betaLN) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float, float *, BLASLONG); + void (*ssyr2k_direct_alpha_betaLT) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float, float *, BLASLONG); #endif @@ -316,10 +364,10 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); double (*damin_k) (BLASLONG, double *, BLASLONG); double (*dmax_k) (BLASLONG, double *, BLASLONG); double (*dmin_k) (BLASLONG, double *, BLASLONG); -BLASLONG (*idamax_k)(BLASLONG, double *, BLASLONG); -BLASLONG (*idamin_k)(BLASLONG, double *, BLASLONG); -BLASLONG (*idmax_k) (BLASLONG, double *, BLASLONG); -BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); + BLASLONG (*idamax_k)(BLASLONG, double *, BLASLONG); + BLASLONG (*idamin_k)(BLASLONG, double *, BLASLONG); + BLASLONG (*idmax_k) (BLASLONG, double *, BLASLONG); + BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); double (*dnrm2_k) (BLASLONG, double *, BLASLONG); double (*dasum_k) (BLASLONG, double *, BLASLONG); @@ -435,10 +483,10 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); xdouble (*qamin_k) (BLASLONG, xdouble *, BLASLONG); xdouble (*qmax_k) (BLASLONG, xdouble *, BLASLONG); xdouble (*qmin_k) (BLASLONG, xdouble *, BLASLONG); -BLASLONG (*iqamax_k)(BLASLONG, xdouble *, BLASLONG); -BLASLONG (*iqamin_k)(BLASLONG, xdouble *, BLASLONG); -BLASLONG (*iqmax_k) (BLASLONG, xdouble *, BLASLONG); -BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); + BLASLONG (*iqamax_k)(BLASLONG, xdouble *, BLASLONG); + BLASLONG (*iqamin_k)(BLASLONG, xdouble *, BLASLONG); + BLASLONG (*iqmax_k) (BLASLONG, xdouble *, BLASLONG); + BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG); xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG); @@ -528,8 +576,8 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); float (*camax_k) (BLASLONG, float *, BLASLONG); float (*camin_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*icamax_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); + BLASLONG (*icamax_k)(BLASLONG, float *, BLASLONG); + BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); float (*cnrm2_k) (BLASLONG, float *, BLASLONG); float (*casum_k) (BLASLONG, float *, BLASLONG); @@ -739,8 +787,8 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); double (*zamax_k) (BLASLONG, double *, BLASLONG); double (*zamin_k) (BLASLONG, double *, BLASLONG); -BLASLONG (*izamax_k)(BLASLONG, double *, BLASLONG); -BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); + BLASLONG (*izamax_k)(BLASLONG, double *, BLASLONG); + BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); double (*znrm2_k) (BLASLONG, double *, BLASLONG); double (*zasum_k) (BLASLONG, double *, BLASLONG); @@ -950,8 +998,8 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); xdouble (*xamax_k) (BLASLONG, xdouble *, BLASLONG); xdouble (*xamin_k) (BLASLONG, xdouble *, BLASLONG); -BLASLONG (*ixamax_k)(BLASLONG, xdouble *, BLASLONG); -BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); + BLASLONG (*ixamax_k)(BLASLONG, xdouble *, BLASLONG); + BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG); xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG); @@ -1229,7 +1277,23 @@ extern gotoblas_t *gotoblas; #define HAVE_EX_L2 gotoblas -> exclusive_cache +#if (BUILD_HFLOAT16==1) +#define SHGEMM_P gotoblas -> shgemm_p +#define SHGEMM_Q gotoblas -> shgemm_q +#define SHGEMM_R gotoblas -> shgemm_r +#define SHGEMM_UNROLL_M gotoblas -> shgemm_unroll_m +#define SHGEMM_UNROLL_N gotoblas -> shgemm_unroll_n +#define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn +#endif + #if (BUILD_BFLOAT16==1) +#define BGEMM_P gotoblas -> bgemm_p +#define BGEMM_Q gotoblas -> bgemm_q +#define BGEMM_R gotoblas -> bgemm_r +#define BGEMM_UNROLL_M gotoblas -> bgemm_unroll_m +#define BGEMM_UNROLL_N gotoblas -> bgemm_unroll_n +#define BGEMM_UNROLL_MN gotoblas -> bgemm_unroll_mn + #define SBGEMM_P gotoblas -> sbgemm_p #define SBGEMM_Q gotoblas -> sbgemm_q #define SBGEMM_R gotoblas -> sbgemm_r @@ -1357,7 +1421,31 @@ extern gotoblas_t *gotoblas; #define HAVE_EX_L2 0 #endif +#if (BUILD_HFLOAT16 == 1) +#define SHGEMM_P SHGEMM_DEFAULT_P +#define SHGEMM_Q SHGEMM_DEFAULT_Q +#define SHGEMM_R SHGEMM_DEFAULT_R +#define SHGEMM_UNROLL_M SHGEMM_DEFAULT_UNROLL_M +#define SHGEMM_UNROLL_N SHGEMM_DEFAULT_UNROLL_N +#ifdef SHGEMM_DEFAULT_UNROLL_MN +#define SHGEMM_UNROLL_MN SHGEMM_DEFAULT_UNROLL_MN +#else +#define SHGEMM_UNROLL_MN MAX((SHGEMM_UNROLL_M), (SHGEMM_UNROLL_N)) +#endif +#endif + #if (BUILD_BFLOAT16 == 1) +#define BGEMM_P BGEMM_DEFAULT_P +#define BGEMM_Q BGEMM_DEFAULT_Q +#define BGEMM_R BGEMM_DEFAULT_R +#define BGEMM_UNROLL_M BGEMM_DEFAULT_UNROLL_M +#define BGEMM_UNROLL_N BGEMM_DEFAULT_UNROLL_N +#ifdef BGEMM_DEFAULT_UNROLL_MN +#define BGEMM_UNROLL_MN BGEMM_DEFAULT_UNROLL_MN +#else +#define BGEMM_UNROLL_MN MAX((BGEMM_UNROLL_M), (BGEMM_UNROLL_N)) +#endif + #define SBGEMM_P SBGEMM_DEFAULT_P #define SBGEMM_Q SBGEMM_DEFAULT_Q #define SBGEMM_R SBGEMM_DEFAULT_R @@ -1478,6 +1566,7 @@ extern gotoblas_t *gotoblas; #endif + #endif #ifndef COMPLEX @@ -1505,6 +1594,30 @@ extern gotoblas_t *gotoblas; #define GEMM_DEFAULT_R DGEMM_DEFAULT_R #define GEMM_DEFAULT_UNROLL_M DGEMM_DEFAULT_UNROLL_M #define GEMM_DEFAULT_UNROLL_N DGEMM_DEFAULT_UNROLL_N +#elif defined(HFLOAT16) +#define GEMM_P SHGEMM_P +#define GEMM_Q SHGEMM_Q +#define GEMM_R SHGEMM_R +#define GEMM_UNROLL_M SHGEMM_UNROLL_M +#define GEMM_UNROLL_N SHGEMM_UNROLL_N +#define GEMM_UNROLL_MN SHGEMM_UNROLL_MN +#define GEMM_DEFAULT_P SHGEMM_DEFAULT_P +#define GEMM_DEFAULT_Q SHGEMM_DEFAULT_Q +#define GEMM_DEFAULT_R SHGEMM_DEFAULT_R +#define GEMM_DEFAULT_UNROLL_M SHGEMM_DEFAULT_UNROLL_M +#define GEMM_DEFAULT_UNROLL_N SHGEMM_DEFAULT_UNROLL_N +#elif defined(BFLOAT16) && defined(BGEMM) +#define GEMM_P BGEMM_P +#define GEMM_Q BGEMM_Q +#define GEMM_R BGEMM_R +#define GEMM_UNROLL_M BGEMM_UNROLL_M +#define GEMM_UNROLL_N BGEMM_UNROLL_N +#define GEMM_UNROLL_MN BGEMM_UNROLL_MN +#define GEMM_DEFAULT_P BGEMM_DEFAULT_P +#define GEMM_DEFAULT_Q BGEMM_DEFAULT_Q +#define GEMM_DEFAULT_R BGEMM_DEFAULT_R +#define GEMM_DEFAULT_UNROLL_M BGEMM_DEFAULT_UNROLL_M +#define GEMM_DEFAULT_UNROLL_N BGEMM_DEFAULT_UNROLL_N #elif defined(BFLOAT16) #define GEMM_P SBGEMM_P #define GEMM_Q SBGEMM_Q diff --git a/common_s.h b/common_s.h index 1dede1e365..e43b74b917 100644 --- a/common_s.h +++ b/common_s.h @@ -49,6 +49,21 @@ #define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant #define SGEMM_DIRECT sgemm_direct +#define SGEMM_DIRECT_ALPHA_BETA sgemm_direct_alpha_beta +#define SSYMM_DIRECT_ALPHA_BETA_LU ssymm_direct_alpha_betaLU +#define SSYMM_DIRECT_ALPHA_BETA_LL ssymm_direct_alpha_betaLL +#define STRMM_DIRECT_LNUN strmm_direct_LNUN +#define STRMM_DIRECT_LNLN strmm_direct_LNLN +#define STRMM_DIRECT_LTUN strmm_direct_LTUN +#define STRMM_DIRECT_LTLN strmm_direct_LTLN +#define SSYRK_DIRECT_ALPHA_BETA_UN ssyrk_direct_alpha_betaUN +#define SSYRK_DIRECT_ALPHA_BETA_UT ssyrk_direct_alpha_betaUT +#define SSYRK_DIRECT_ALPHA_BETA_LN ssyrk_direct_alpha_betaLN +#define SSYRK_DIRECT_ALPHA_BETA_LT ssyrk_direct_alpha_betaLT +#define SSYR2K_DIRECT_ALPHA_BETA_UN ssyr2k_direct_alpha_betaUN +#define SSYR2K_DIRECT_ALPHA_BETA_UT ssyr2k_direct_alpha_betaUT +#define SSYR2K_DIRECT_ALPHA_BETA_LN ssyr2k_direct_alpha_betaLN +#define SSYR2K_DIRECT_ALPHA_BETA_LT ssyr2k_direct_alpha_betaLT #define SGEMM_ONCOPY sgemm_oncopy #define SGEMM_OTCOPY sgemm_otcopy @@ -216,8 +231,23 @@ #define SGEMM_DIRECT_PERFORMANT gotoblas -> sgemm_direct_performant #define SGEMM_DIRECT gotoblas -> sgemm_direct #elif ARCH_ARM64 -#define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant +#define SGEMM_DIRECT_PERFORMANT gotoblas -> sgemm_direct_performant #define SGEMM_DIRECT gotoblas -> sgemm_direct +#define SGEMM_DIRECT_ALPHA_BETA gotoblas -> sgemm_direct_alpha_beta +#define SSYMM_DIRECT_ALPHA_BETA_LU gotoblas -> ssymm_direct_alpha_betaLU +#define SSYMM_DIRECT_ALPHA_BETA_LL gotoblas -> ssymm_direct_alpha_betaLL +#define STRMM_DIRECT_LNUN gotoblas -> strmm_direct_LNUN +#define STRMM_DIRECT_LNLN gotoblas -> strmm_direct_LNLN +#define STRMM_DIRECT_LTUN gotoblas -> strmm_direct_LTUN +#define STRMM_DIRECT_LTLN gotoblas -> strmm_direct_LTLN +#define SSYRK_DIRECT_ALPHA_BETA_UN gotoblas -> ssyrk_direct_alpha_betaUN +#define SSYRK_DIRECT_ALPHA_BETA_UT gotoblas -> ssyrk_direct_alpha_betaUT +#define SSYRK_DIRECT_ALPHA_BETA_LN gotoblas -> ssyrk_direct_alpha_betaLN +#define SSYRK_DIRECT_ALPHA_BETA_LT gotoblas -> ssyrk_direct_alpha_betaLT +#define SSYR2K_DIRECT_ALPHA_BETA_UN gotoblas -> ssyr2k_direct_alpha_betaUN +#define SSYR2K_DIRECT_ALPHA_BETA_UT gotoblas -> ssyr2k_direct_alpha_betaUT +#define SSYR2K_DIRECT_ALPHA_BETA_LN gotoblas -> ssyr2k_direct_alpha_betaLN +#define SSYR2K_DIRECT_ALPHA_BETA_LT gotoblas -> ssyr2k_direct_alpha_betaLT #endif #define SGEMM_ONCOPY gotoblas -> sgemm_oncopy diff --git a/common_sh.h b/common_sh.h new file mode 100644 index 0000000000..99dbb65180 --- /dev/null +++ b/common_sh.h @@ -0,0 +1,107 @@ +/*************************************************************************** + * Copyright (c) 2025, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#ifndef COMMON_SH_H +#define COMMON_SH_H + +#ifndef DYNAMIC_ARCH + +#define SHGEMM_ONCOPY shgemm_oncopy +#define SHGEMM_OTCOPY shgemm_otcopy + +#if SGEMM_DEFAULT_UNROLL_M == SGEMM_DEFAULT_UNROLL_N +#define SHGEMM_INCOPY shgemm_oncopy +#define SHGEMM_ITCOPY shgemm_otcopy +#else +#define SHGEMM_INCOPY shgemm_incopy +#define SHGEMM_ITCOPY shgemm_itcopy +#endif + +#define SHGEMM_BETA shgemm_beta +#define SHGEMM_KERNEL shgemm_kernel + +#define SHGEMV_N_K shgemv_n +#define SHGEMV_T_K shgemv_t + + +#else // #DYNAMIC_ARCH + +#define SHGEMM_ONCOPY gotoblas -> shgemm_oncopy +#define SHGEMM_OTCOPY gotoblas -> shgemm_otcopy +#if SGEMM_DEFAULT_UNROLL_M == SGEMM_DEFAULT_UNROLL_N +#define SHGEMM_INCOPY gotoblas -> shgemm_oncopy +#define SHGEMM_ITCOPY gotoblas -> shgemm_otcopy +#else +#define SHGEMM_INCOPY gotoblas -> shgemm_incopy +#define SHGEMM_ITCOPY gotoblas -> shgemm_itcopy +#endif + +#define SHGEMM_BETA gotoblas -> shgemm_beta +#define SHGEMM_KERNEL gotoblas -> shgemm_kernel + +#define SHGEMV_N_K gotoblas->shgemv_n +#define SHGEMV_T_K gotoblas->shgemv_t + +#endif // #DYNAMIC_ARCH + +#define SHGEMM_NN shgemm_nn +#define SHGEMM_CN shgemm_tn +#define SHGEMM_TN shgemm_tn +#define SHGEMM_NC shgemm_nt +#define SHGEMM_NT shgemm_nt +#define SHGEMM_CC shgemm_tt +#define SHGEMM_CT shgemm_tt +#define SHGEMM_TC shgemm_tt +#define SHGEMM_TT shgemm_tt +#define SHGEMM_NR shgemm_nn +#define SHGEMM_TR shgemm_tn +#define SHGEMM_CR shgemm_tn +#define SHGEMM_RN shgemm_nn +#define SHGEMM_RT shgemm_nt +#define SHGEMM_RC shgemm_nt +#define SHGEMM_RR shgemm_nn + +#define SHGEMM_THREAD_NN shgemm_thread_nn +#define SHGEMM_THREAD_CN shgemm_thread_tn +#define SHGEMM_THREAD_TN shgemm_thread_tn +#define SHGEMM_THREAD_NC shgemm_thread_nt +#define SHGEMM_THREAD_NT shgemm_thread_nt +#define SHGEMM_THREAD_CC shgemm_thread_tt +#define SHGEMM_THREAD_CT shgemm_thread_tt +#define SHGEMM_THREAD_TC shgemm_thread_tt +#define SHGEMM_THREAD_TT shgemm_thread_tt +#define SHGEMM_THREAD_NR shgemm_thread_nn +#define SHGEMM_THREAD_TR shgemm_thread_tn +#define SHGEMM_THREAD_CR shgemm_thread_tn +#define SHGEMM_THREAD_RN shgemm_thread_nn +#define SHGEMM_THREAD_RT shgemm_thread_nt +#define SHGEMM_THREAD_RC shgemm_thread_nt +#define SHGEMM_THREAD_RR shgemm_thread_nn + + +#endif // #COMMON_SH_H \ No newline at end of file diff --git a/cpuid.S b/cpuid.S index 295917bdbe..a8f5f8b573 100644 --- a/cpuid.S +++ b/cpuid.S @@ -66,5 +66,9 @@ _cpuid: #endif #if defined(__ELF__) && defined(__linux__) +#if defined(__arm__) + .section .note.GNU-stack,"",%progbits +#else .section .note.GNU-stack,"",@progbits #endif +#endif diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 2bf93cc87b..06b595e779 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -79,8 +79,10 @@ size_t length64=sizeof(value64); #define CPU_TSV110 9 // Ampere #define CPU_EMAG8180 10 +#define CPU_AMPERE1 25 // Apple #define CPU_VORTEX 13 +#define CPU_VORTEXM4 26 // Fujitsu #define CPU_A64FX 15 // Phytium @@ -111,7 +113,9 @@ static char *cpuname[] = { "CORTEXA710", "FT2000", "CORTEXA76", - "NEOVERSEV2" + "NEOVERSEV2", + "AMPERE1", + "VORTEXM4", }; static char *cpuname_lower[] = { @@ -139,13 +143,17 @@ static char *cpuname_lower[] = { "cortexa710", "ft2000", "cortexa76", - "neoversev2" + "neoversev2", + "ampere1", + "vortexm4" }; static int cpulowperf=0; static int cpumidperf=0; static int cpuhiperf=0; +int aliased = 0; + int get_feature(char *search) { @@ -334,11 +342,15 @@ int detect(void) // Ampere else if (strstr(cpu_implementer, "0x50") && strstr(cpu_part, "0x000")) return CPU_EMAG8180; + else if (strstr(cpu_implementer, "0xc0")) { + if (strstr(cpu_part, "0xac3") || strstr(cpu_part, "0xac4")) + return CPU_AMPERE1; + } // Fujitsu else if (strstr(cpu_implementer, "0x46") && strstr(cpu_part, "0x001")) return CPU_A64FX; // Apple - else if (strstr(cpu_implementer, "0x61") && strstr(cpu_part, "0x022")) + else if (strstr(cpu_implementer, "0x61") /* && strstr(cpu_part, "0x022")*/) return CPU_VORTEX; // Phytium else if (strstr(cpu_implementer, "0x70") && (strstr(cpu_part, "0x660") || strstr(cpu_part, "0x661") @@ -392,7 +404,8 @@ int detect(void) if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1 if (value64 == 3660830781) return CPU_VORTEX; //A15/M2 if (value64 == 2271604202) return CPU_VORTEX; //A16/M3 - if (value64 == 1867590060) return CPU_VORTEX; //M4 + if (value64 == 1867590060) return CPU_VORTEXM4; //M4 + if (value64 == 492472296) return CPU_VORTEXM4; //M5 #else #ifdef OS_WINDOWS HKEY reghandle; @@ -410,7 +423,10 @@ int detect(void) if (errcode != ERROR_SUCCESS) wprintf(L"Error reading cpuname from registry:%x\n",errcode); //wprintf(stderr,L"%s\n",(PWSTR)valstring); RegCloseKey(reghandle); - if (strstr(valstring, "Snapdragon(R) X Elite")) return CPU_NEOVERSEN1; + if (strstr(valstring, "Snapdragon(R) X Elite")) { + aliased = 1; + return CPU_NEOVERSEN1; + } if (strstr(valstring, "Ampere(R) Altra")) return CPU_NEOVERSEN1; if (strstr(valstring, "Snapdragon (TM) 8cx Gen 3")) return CPU_CORTEXX1; if (strstr(valstring, "Snapdragon Compute Platform")) return CPU_CORTEXX1; @@ -533,6 +549,7 @@ void get_cpuconfig(void) break; case CPU_NEOVERSEN1: printf("#define %s\n", cpuname[d]); + if (aliased == 0) { printf("#define L1_CODE_SIZE 65536\n"); printf("#define L1_CODE_LINESIZE 64\n"); printf("#define L1_CODE_ASSOCIATIVE 4\n"); @@ -544,6 +561,23 @@ void get_cpuconfig(void) printf("#define L2_ASSOCIATIVE 8\n"); printf("#define DTB_DEFAULT_ENTRIES 48\n"); printf("#define DTB_SIZE 4096\n"); + } else { + printf("#define L1_CODE_SIZE 196608\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_CODE_ASSOCIATIVE 6\n"); + printf("#define L1_DATA_SIZE 98304\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L1_DATA_ASSOCIATIVE 6\n"); + printf("#define L2_SIZE 12582912\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define L2_ASSOCIATIVE 12\n"); + printf("#define ITB_SIZE 4096\n"); + printf("#define ITB_ASSOCIATIVE 8\n"); + printf("#define ITB_DEFAULT_ENTRIES 256\n"); + printf("#define DTB_DEFAULT_ENTRIES 224\n"); + printf("#define DTB_ASSOCIATIVE 7\n"); + printf("#define DTB_SIZE 4096\n"); + } break; case CPU_NEOVERSEV1: @@ -684,6 +718,21 @@ void get_cpuconfig(void) printf("#define DTB_SIZE 4096\n"); break; + case CPU_AMPERE1: + printf("#define %s\n", cpuname[d]); + printf("#define L1_CODE_SIZE 16384\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_CODE_ASSOCIATIVE 4\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L1_DATA_ASSOCIATIVE 4\n"); + printf("#define L2_SIZE 2097152\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define L2_ASSOCIATIVE 8\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + break; + case CPU_THUNDERX3T110: printf("#define THUNDERX3T110 \n"); printf("#define L1_CODE_SIZE 65536 \n"); @@ -703,6 +752,29 @@ void get_cpuconfig(void) break; case CPU_VORTEX: printf("#define VORTEX \n"); +#ifdef __APPLE__ + length64 = sizeof(value64); + sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0); + printf("#define L1_CODE_SIZE %lld \n",value64); + length64 = sizeof(value64); + sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0); + printf("#define L1_CODE_LINESIZE %lld \n",value64); + printf("#define L1_DATA_LINESIZE %lld \n",value64); + length64 = sizeof(value64); + sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0); + printf("#define L1_DATA_SIZE %lld \n",value64); + length64 = sizeof(value64); + sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0); + printf("#define L2_SIZE %lld \n",value64); +#endif + printf("#define DTB_DEFAULT_ENTRIES 64 \n"); + printf("#define DTB_SIZE 4096 \n"); + break; + case CPU_VORTEXM4: + printf("#define VORTEXM4 \n"); +#ifdef __clang__ + printf("#define HAVE_SME 1 \n"); +#endif #ifdef __APPLE__ length64 = sizeof(value64); sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0); diff --git a/cpuid_power.c b/cpuid_power.c index 1ced8930a5..2b2c32eeab 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -131,6 +131,7 @@ int detect(void){ if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9; if (!strncasecmp(p, "POWER10", 7)) return CPUTYPE_POWER10; + if (!strncasecmp(p, "POWER11", 7)) return CPUTYPE_POWER10; if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; @@ -171,6 +172,9 @@ int detect(void){ int id; __asm __volatile("mfpvr %0" : "=r"(id)); switch ( id >> 16 ) { + case 0x82: // POWER11 + return CPUTYPE_POWER10; + break; case 0x80: // POWER10 return CPUTYPE_POWER10; break; diff --git a/cpuid_x86.c b/cpuid_x86.c index 1b09c7217c..8e1438e500 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1567,6 +1567,7 @@ int get_cpuname(void){ case 10: case 15: case 14: // Alder Lake N + case 13: // Lunar Lake if(support_avx2()) return CPUTYPE_HASWELL; if(support_avx()) @@ -2412,7 +2413,8 @@ int get_coretype(void){ case 7: // Raptor Lake case 10: case 15: - case 14: // Alder Lake N + case 14: // Alder Lake N + case 13: // Lunar Lake #ifndef NO_AVX2 if(support_avx2()) return CORE_HASWELL; diff --git a/docs/dgemm_snb_1thread.png b/docs/dgemm_snb_1thread.png new file mode 100644 index 0000000000..71211fc8e4 Binary files /dev/null and b/docs/dgemm_snb_1thread.png differ diff --git a/docs/faq.md b/docs/faq.md index 93d76c67fb..de2f54bc38 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -91,7 +91,7 @@ like Intel Haswell. There once was an effort to build an OpenCL implementation t We obtained a performance comparable with Intel MKL that actually outperformed Intel MKL in some cases. Here is the result of the DGEMM subroutine's performance on Intel Core i5-2500K Windows 7 SP1 64-bit: -![Single Thread DGEMM Performance on Intel Desktop Sandy Bridge](http://xianyi.github.com/OpenBLAS/dgemm_snb_1thread.png) +![Single Thread DGEMM Performance on Intel Desktop Sandy Bridge](dgemm_snb_1thread.png)
@@ -220,8 +220,8 @@ lead to compiler error messages about an "ABI change" when compiling AVX512 code ### Building OpenBLAS on POWER fails with IBM XL Trying to compile OpenBLAS with IBM XL ends with error messages about unknown register names -like "vs32". Working around these by using known alternate names for the vector registers only leads to another assembler error about unsupported constraints. This is a known deficiency in the IBM compiler at least up to and including 16.1.0 (and in the POWER version of clang, from which it is derived) - use gcc instead. (See issues #1078 -and #1699 for related discussions) +like "vs32". Working around these by using known alternate names for the vector registers only leads to another assembler error about unsupported constraints. This is a known deficiency in the IBM compiler at least up to and including 16.1.0 (and in the POWER version of clang, from which it is derived) - use gcc instead. (See issues [#1078](https://github.com/OpenMathLib/OpenBLAS/issues/1078) +and [#1699](https://github.com/OpenMathLib/OpenBLAS/issues/1699) for related discussions) ### Replacing system BLAS/updating APT OpenBLAS in Mint/Ubuntu/Debian @@ -268,7 +268,7 @@ path (usually either /usr/local/include, /opt/OpenBLAS/include or whatever you s This is due to different interpretations of the (informal) standard for passing characters as arguments between C and FORTRAN functions. As the method for storing text differs in the two languages, when C calls Fortran the text length is passed as an "invisible" additional parameter. Historically, this has not been required when the text is just a single character, so older code like the Reference-LAPACK bundled with OpenBLAS -does not do it. Recently gcc's checking has changed to require it, but there is no consensus yet if and how the existing LAPACK (and many other codebases) should adapt. (And for actual compilation, gcc has mostly backtracked and provided compatibility options - hence the default build settings in the OpenBLAS Makefiles add -fno-optimize-sibling-calls to the gfortran options to prevent miscompilation with "affected" versions. See ticket 2154 in the issue tracker for more details and links) +does not do it. Recently gcc's checking has changed to require it, but there is no consensus yet if and how the existing LAPACK (and many other codebases) should adapt. (And for actual compilation, gcc has mostly backtracked and provided compatibility options - hence the default build settings in the OpenBLAS Makefiles add -fno-optimize-sibling-calls to the gfortran options to prevent miscompilation with "affected" versions. See ticket [#2154](https://github.com/OpenMathLib/OpenBLAS/issues/2154) in the issue tracker for more details and links)
### Build fails with lots of errors about undefined ?GEMM_UNROLL_M diff --git a/docs/install.md b/docs/install.md index 656c6a1219..5e31b50661 100644 --- a/docs/install.md +++ b/docs/install.md @@ -217,8 +217,11 @@ in this section, since the process for each is quite different. For Visual Studio, you can use CMake to generate Visual Studio solution files; note that you will need at least CMake 3.11 for linking to work correctly). -Note that you need a Fortran compiler if you plan to build and use the LAPACK -functions included with OpenBLAS. The sections below describe using either +Note that you need a Fortran compiler if you plan to build and use the latest version +of the LAPACK functions included with OpenBLAS. (If you do not have a Fortran compiler +installed, you can build an older version of the LAPACK sources that has been converted +to C - but its performance will likely be slower and accuracy may be poorer too.) +The sections below describe using either `flang` as an add-on to clang/LLVM or `gfortran` as part of MinGW for this purpose. If you want to use the Intel Fortran compiler (`ifort` or `ifx`) for this, be sure to also use the Intel C compiler (`icc` or `icx`) for building @@ -226,21 +229,22 @@ the C parts, as the ABI imposed by `ifort` is incompatible with MSVC A fully-optimized OpenBLAS that can be statically or dynamically linked to your application can currently be built for the 64-bit architecture with the LLVM -compiler infrastructure. We're going to use [Miniconda3](https://docs.anaconda.com/miniconda/) +compiler infrastructure. We're going to use [Miniforge3] the pre-configured +and more versatile alternative to [Miniconda](https://docs.anaconda.com/miniconda/) to grab all of the tools we need, since some of them are in an experimental status. Before you begin, you'll need to have Microsoft Visual Studio 2015 or newer installed. -1. Install Miniconda3 for 64-bit Windows using `winget install --id Anaconda.Miniconda3`, - or easily download from [conda.io](https://docs.conda.io/en/latest/miniconda.html). -2. Open the "Anaconda Command Prompt" now available in the Start Menu, or at `%USERPROFILE%\miniconda3\shell\condabin\conda-hook.ps1`. +1. Install Miniforge for 64-bit Windows with the latest version of the installer Miniforge3-Windows-x86_64.exe + available on [github.com](https://github.com/conda-forge/miniforge/releases/) +2. Open the "Miniforge Command Prompt" now available in the Start Menu, or at `%USERPROFILE%\miniforge3\shell\condabin\conda-hook.ps1`. 3. In that command prompt window, use `cd` to change to the directory where you want to build OpenBLAS. 4. Now install all of the tools we need: ``` conda update -n base conda - conda config --add channels conda-forge - conda install -y cmake flang clangdev perl libflang ninja + conda install -y cmake flang_win-64 clangdev perl libflang ninja ``` + (if you want to build with OpenMP support, add `llvm-openmp` and `llvm-openmp-fortran`) 5. Still in the Anaconda Command Prompt window, activate the 64-bit MSVC environment with `vcvarsall x64`. On Windows 11 with Visual Studio 2022, this would be done by invoking: @@ -623,14 +627,22 @@ Note: using `TARGET=CORTEXA57` in place of `ARMV8` will pick up better optimized routines. Implementations for the `CORTEXA57` target are compatible with all other `ARMV8` targets. -Note: for NDK 23b, something as simple as: +Note: for NDK 23b and later, something as simple as: ```bash export PATH=/opt/android-ndk-r23b/toolchains/llvm/prebuilt/linux-x86_64/bin/:$PATH -make HOSTCC=gcc CC=/opt/android-ndk-r23b/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android31-clang ONLY_CBLAS=1 TARGET=ARMV8 +make HOSTCC=gcc CC=/opt/android-ndk-r23b/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android31-clang ONLY_CBLAS=1 TARGET=ARMV8 RANLIB=echo ``` appears to be sufficient on Linux. On OSX, setting AR to the ar provided in the "bin" path of the NDK (probably `llvm-ar`) is also necessary. +If you prefer building with CMake, running +```bash +cmake -DANDROID_ABI=arm64-v8a -DTARGET=ARMV8 -DCMAKE_TOOLCHAIN_FILE=/opt/android-ndk-r27/build/cmake/android.toolchain.cmake -DNOFORTRAN=1 -DANDROID_PLATFORM=android-23 .. +cmake --build . +``` +in your build directory should work (be sure to adjust the toolchain_file argument according to where you installed the NDK, and the ANDROID_PLATFORM +according to the minimum version of Android you want to support. (If you leave out the ANDROID_PLATFORM parameter, the build will fail with an error +message about a missing declaration or missing header file complex.h) ??? note "Alternative build script for 3 architectures" @@ -700,9 +712,10 @@ fully working OpenBLAS for this platform. Go to the directory where you unpacked OpenBLAS,and enter the following commands: ```bash -CC=/Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang +CC="/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" -CFLAGS= -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch arm64 -miphoneos-version-min=10.0 +SDKROOT="$(xcrun --sdk iphoneos --show-sdk-path)" +CFLAGS="-O2 -Wno-macro-redefined -isysroot $SDKROOT -arch arm64 -miphoneos-version-min=10.0" make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 ``` diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index c52b461a7e..a0360f7e1a 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -202,6 +202,15 @@ if (BUILD_BFLOAT16) if (USE_THREAD) GenerateNamedObjects("sbgemv_thread.c" "" "gemv_thread_n" false "" "" false "BFLOAT16") GenerateNamedObjects("sbgemv_thread.c" "TRANSA" "gemv_thread_t" false "" "" false "BFLOAT16") + GenerateNamedObjects("sbgemv_thread.c" "BGEMM;BFLOAT16" "bgemv_thread_n" false "" "" true "") + GenerateNamedObjects("sbgemv_thread.c" "BGEMM;BFLOAT16;TRANSA" "bgemv_thread_t" false "" "" true "") + endif () +endif () + +if (BUILD_HFLOAT16) + if (USE_THREAD) + GenerateNamedObjects("sbgemv_thread.c" "" "gemv_thread_n" false "" "" false "HFLOAT16") + GenerateNamedObjects("sbgemv_thread.c" "TRANSA" "gemv_thread_t" false "" "" false "HFLOAT16") endif () endif () diff --git a/driver/level2/Makefile b/driver/level2/Makefile index 5f8c712a8e..d50e70bcd1 100644 --- a/driver/level2/Makefile +++ b/driver/level2/Makefile @@ -1,3 +1,31 @@ +############################################################################### +# Copyright (c) 2025 The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +############################################################################### + TOPDIR = ../.. include ../../Makefile.system @@ -422,7 +450,16 @@ XBLASOBJS += \ xtbmv_thread_CUU.$(SUFFIX) xtbmv_thread_CUN.$(SUFFIX) \ xtbmv_thread_CLU.$(SUFFIX) xtbmv_thread_CLN.$(SUFFIX) + +ifeq ($(BUILD_HFLOAT16),1) +SHBLASOBJS += \ + shgemv_thread_n$(TSUFFIX).$(SUFFIX) \ + shgemv_thread_t$(TSUFFIX).$(SUFFIX) +endif ifeq ($(BUILD_BFLOAT16),1) +BBLASOBJS += \ + bgemv_thread_n$(TSUFFIX).$(SUFFIX) \ + bgemv_thread_t$(TSUFFIX).$(SUFFIX) SBBLASOBJS += \ sbgemv_thread_n$(TSUFFIX).$(SUFFIX) \ sbgemv_thread_t$(TSUFFIX).$(SUFFIX) @@ -3706,7 +3743,18 @@ xtrsv_CUU.$(SUFFIX) xtrsv_CUU.$(PSUFFIX) : ztrsv_L.c ../../param.h xtrsv_CUN.$(SUFFIX) xtrsv_CUN.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) +ifeq ($(BUILD_HFLOAT16),1) +shgemv_thread_n.$(SUFFIX) shgemv_thread_n.$(PSUFFIX) : sbgemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) +shgemv_thread_t.$(SUFFIX) shgemv_thread_t.$(PSUFFIX) : sbgemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) +endif + ifeq ($(BUILD_BFLOAT16),1) +bgemv_thread_n.$(SUFFIX) bgemv_thread_n.$(PSUFFIX) : sbgemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DBGEMM -UCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) +bgemv_thread_t.$(SUFFIX) bgemv_thread_t.$(PSUFFIX) : sbgemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DBGEMM -UCOMPLEX -UDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) sbgemv_thread_n.$(SUFFIX) sbgemv_thread_n.$(PSUFFIX) : sbgemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) sbgemv_thread_t.$(SUFFIX) sbgemv_thread_t.$(PSUFFIX) : sbgemv_thread.c ../../common.h diff --git a/driver/level2/sbgemv_thread.c b/driver/level2/sbgemv_thread.c index 534c60f95b..c7fc90a350 100644 --- a/driver/level2/sbgemv_thread.c +++ b/driver/level2/sbgemv_thread.c @@ -1,4 +1,5 @@ /*********************************************************************/ +/* Copyright 2025 The OpenBLAS Project. */ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ @@ -41,21 +42,21 @@ #include "common.h" #ifndef TRANSA -#define SBGEMV SBGEMV_N +#define GEMV GEMV_N #else -#define SBGEMV SBGEMV_T +#define GEMV GEMV_T #endif static int sbgemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *dummy2, BLASLONG dummy3){ - bfloat16 *a, *x; - float *y; + IFLOAT *a, *x; + FLOAT *y; BLASLONG lda, incx, incy; BLASLONG m_from, m_to, n_from, n_to; - a = (bfloat16 *)args->a; - x = (bfloat16 *)args->b; - y = (float *)args->c; + a = (IFLOAT *)args->a; + x = (IFLOAT *)args->b; + y = (FLOAT *)args->c; lda = args->lda; incx = args->ldb; @@ -77,12 +78,12 @@ static int sbgemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, y += n_from * incy; #endif - SBGEMV(m_to - m_from, n_to - n_from, *((FLOAT *)(args->alpha)), a, lda, x, incx, *((FLOAT *)(args->beta)), y, incy); + GEMV(m_to - m_from, n_to - n_from, *((FLOAT *)(args->alpha)), a, lda, x, incx, *((FLOAT *)(args->beta)), y, incy); return 0; } -int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, BLASLONG incx, float beta, float *y, BLASLONG incy, int threads) +int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT *x, BLASLONG incx, FLOAT beta, FLOAT *y, BLASLONG incy, int threads) { blas_arg_t args; blas_queue_t queue[MAX_CPU_NUMBER]; diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index eabfeed24a..468628af2e 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -14,8 +14,16 @@ foreach (GEMM_DEFINE ${GEMM_DEFINES}) endif () if (BUILD_BFLOAT16) GenerateNamedObjects("gemm.c" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16") + GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};BGEMM" "gemm_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16") if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3) GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16") + GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3;BGEMM" "gemm_thread_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16") + endif () + endif () + if (BUILD_HFLOAT16) + GenerateNamedObjects("gemm.c" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0 "" "" false "HFLOAT16") + if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3) + GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0 "" "" false "HFLOAT16") endif () endif () endforeach () diff --git a/driver/level3/Makefile b/driver/level3/Makefile index c304838423..622996c3b8 100644 --- a/driver/level3/Makefile +++ b/driver/level3/Makefile @@ -1,3 +1,32 @@ +############################################################################### +# Copyright (c) 2025, The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +############################################################################### + TOPDIR = ../.. include ../../Makefile.system @@ -20,9 +49,14 @@ USE_GEMM3M = 1 endif ifeq ($(BUILD_BFLOAT16),1) +BBLASOBJS += bgemm_nn.$(SUFFIX) bgemm_nt.$(SUFFIX) bgemm_tn.$(SUFFIX) bgemm_tt.$(SUFFIX) SBBLASOBJS += sbgemm_nn.$(SUFFIX) sbgemm_nt.$(SUFFIX) sbgemm_tn.$(SUFFIX) sbgemm_tt.$(SUFFIX) endif +ifeq ($(BUILD_HFLOAT16),1) +SHBLASOBJS += shgemm_nn.$(SUFFIX) shgemm_nt.$(SUFFIX) shgemm_tn.$(SUFFIX) shgemm_tt.$(SUFFIX) +endif + SBLASOBJS += \ sgemm_nn.$(SUFFIX) sgemm_nt.$(SUFFIX) sgemm_tn.$(SUFFIX) sgemm_tt.$(SUFFIX) \ strmm_LNUU.$(SUFFIX) strmm_LNUN.$(SUFFIX) strmm_LNLU.$(SUFFIX) strmm_LNLN.$(SUFFIX) \ @@ -208,8 +242,12 @@ COMMONOBJS += syrk_thread.$(SUFFIX) ifneq ($(USE_SIMPLE_THREADED_LEVEL3), 1) ifeq ($(BUILD_BFLOAT16),1) +BBLASOBJS += bgemm_thread_nn.$(SUFFIX) bgemm_thread_nt.$(SUFFIX) bgemm_thread_tn.$(SUFFIX) bgemm_thread_tt.$(SUFFIX) SBBLASOBJS += sbgemm_thread_nn.$(SUFFIX) sbgemm_thread_nt.$(SUFFIX) sbgemm_thread_tn.$(SUFFIX) sbgemm_thread_tt.$(SUFFIX) endif +ifeq ($(BUILD_HFLOAT16),1) +SHBLASOBJS += shgemm_thread_nn.$(SUFFIX) shgemm_thread_nt.$(SUFFIX) shgemm_thread_tn.$(SUFFIX) shgemm_thread_tt.$(SUFFIX) +endif SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX) DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) QBLASOBJS += qgemm_thread_nn.$(SUFFIX) qgemm_thread_nt.$(SUFFIX) qgemm_thread_tn.$(SUFFIX) qgemm_thread_tt.$(SUFFIX) @@ -343,17 +381,41 @@ endif all :: +bgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +bgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +bgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +bgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) + sbgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) sbgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) sbgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) sbgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) + +shgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +shgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +shgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +shgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) sgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) @@ -550,17 +612,41 @@ gemm_thread_variable.$(SUFFIX) : gemm_thread_variable.c ../../common.h beta_thread.$(SUFFIX) : beta_thread.c ../../common.h $(CC) -c $(CFLAGS) $< -o $(@F) +bgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +bgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +bgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +bgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) + sbgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DBFLOAT16 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) sbgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DBFLOAT16 -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) sbgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DBFLOAT16 -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) sbgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DBFLOAT16 -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) + +shgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHFLOAT16 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +shgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHFLOAT16 -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +shgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHFLOAT16 -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +shgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHFLOAT16 -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) sgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) @@ -2736,16 +2822,28 @@ xtrsm_RCLN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) sbgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) sbgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) sbgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) sbgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) + +shgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +shgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +shgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +shgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) sgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) @@ -2959,16 +3057,28 @@ zgemm_batch_thread.$(SUFFIX) : gemm_batch_thread.c ../../common.h sbgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DBFLOAT16 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) sbgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DBFLOAT16 -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) sbgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DBFLOAT16 -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) sbgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DBFLOAT16 -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) + +shgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHFLOAT16 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +shgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHFLOAT16 -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +shgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHFLOAT16 -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +shgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHFLOAT16 -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) sgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) diff --git a/driver/level3/gemm.c b/driver/level3/gemm.c index 2b13da7d70..e37d86c28d 100644 --- a/driver/level3/gemm.c +++ b/driver/level3/gemm.c @@ -59,6 +59,14 @@ #define GEMM_Q 128 #endif +#ifdef GEMM_DIVIDE_RATE +#define DIVIDE_RATE GEMM_DIVIDE_RATE +#endif + +#ifdef GEMM_DIVIDE_LIMIT +#define DIVIDE_LIMIT GEMM_DIVIDE_LIMIT +#endif + #ifdef THREADED_LEVEL3 #include "level3_thread.c" #else diff --git a/driver/level3/level3.c b/driver/level3/level3.c index b7328876b4..78bc6aa527 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2025 The OpenBLAS Project. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -169,6 +170,22 @@ #define STOP_RPCC(COUNTER) #endif +#if defined(BUILD_BFLOAT16) +#if defined(DYNAMIC_ARCH) + #if defined(BGEMM) + #define BFLOAT16_ALIGN_K gotoblas->bgemm_align_k + #else + #define BFLOAT16_ALIGN_K gotoblas->sbgemm_align_k + #endif +#else + #if defined(BGEMM) + #define BFLOAT16_ALIGN_K BGEMM_ALIGN_K + #else + #define BFLOAT16_ALIGN_K SBGEMM_ALIGN_K + #endif +#endif +#endif + int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, XFLOAT *sa, XFLOAT *sb, BLASLONG dummy){ BLASLONG k, lda, ldb, ldc; @@ -305,12 +322,8 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } BLASLONG pad_min_l = min_l; -#if defined(HALF) -#if defined(DYNAMIC_ARCH) - pad_min_l = (min_l + gotoblas->sbgemm_align_k - 1) & ~(gotoblas->sbgemm_align_k-1); -#else - pad_min_l = (min_l + SBGEMM_ALIGN_K - 1) & ~(SBGEMM_ALIGN_K - 1);; -#endif +#if defined(BFLOAT16) + pad_min_l = (min_l + BFLOAT16_ALIGN_K - 1) & ~(BFLOAT16_ALIGN_K - 1); #endif /* First, we have to move data A to L2 cache */ diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index db3bffc10a..327dc2d01d 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -1,6 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ -/* Copyright 2023 The OpenBLAS Project. */ +/* Copyright 2023, 2025 The OpenBLAS Project. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -216,6 +216,22 @@ typedef struct { #define STOP_RPCC(COUNTER) #endif +#if defined(BUILD_BFLOAT16) +#if defined(DYNAMIC_ARCH) + #if defined(BGEMM) + #define BFLOAT16_ALIGN_K gotoblas->bgemm_align_k + #else + #define BFLOAT16_ALIGN_K gotoblas->sbgemm_align_k + #endif +#else + #if defined(BGEMM) + #define BFLOAT16_ALIGN_K BGEMM_ALIGN_K + #else + #define BFLOAT16_ALIGN_K SBGEMM_ALIGN_K + #endif +#endif +#endif + static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IFLOAT *sb, BLASLONG mypos){ IFLOAT *buffer[DIVIDE_RATE]; @@ -230,6 +246,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, BLASLONG nthreads_m; BLASLONG mypos_m, mypos_n; + BLASLONG divide_rate = DIVIDE_RATE; BLASLONG is, js, ls, bufferside, jjs; BLASLONG min_i, min_l, div_n, min_jj; @@ -264,6 +281,11 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, alpha = (FLOAT *)args -> alpha; beta = (FLOAT *)args -> beta; + /* Disable divide_rate when N of all threads are less than to DIVIDE_LIMIT */ +#ifdef DIVIDE_LIMIT + if (N < DIVIDE_LIMIT) divide_rate = 1; +#endif + /* Initialize 2D CPU distribution */ nthreads_m = args -> nthreads; if (range_m) { @@ -305,9 +327,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, ) return 0; /* Initialize workspace for local region of B */ - div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; + div_n = (n_to - n_from + divide_rate - 1) / divide_rate; buffer[0] = sb; - for (i = 1; i < DIVIDE_RATE; i++) { + for (i = 1; i < divide_rate; i++) { buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE; } @@ -324,12 +346,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, BLASLONG pad_min_l = min_l; -#if defined(HALF) -#if defined(DYNAMIC_ARCH) - pad_min_l = (min_l + gotoblas->sbgemm_align_k - 1) & ~(gotoblas->sbgemm_align_k-1); -#else - pad_min_l = (min_l + SBGEMM_ALIGN_K - 1) & ~(SBGEMM_ALIGN_K - 1);; -#endif +#if defined(BFLOAT16) + pad_min_l = (min_l + BFLOAT16_ALIGN_K - 1) & ~(BFLOAT16_ALIGN_K - 1); #endif /* Determine step size in m @@ -353,7 +371,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, STOP_RPCC(copy_A); /* Copy local region of B into workspace and apply kernel */ - div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; + div_n = (n_to - n_from + divide_rate - 1) / divide_rate; for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) { /* Make sure if no one is using workspace */ @@ -422,7 +440,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, if (current >= (mypos_n + 1) * nthreads_m) current = mypos_n * nthreads_m; /* Split other region of B into parts */ - div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; + div_n = (range_n[current + 1] - range_n[current] + divide_rate - 1) / divide_rate; for (js = range_n[current], bufferside = 0; js < range_n[current + 1]; js += div_n, bufferside ++) { if (current != mypos) { @@ -473,7 +491,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, do { /* Split region of B into parts and apply kernel */ - div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; + div_n = (range_n[current + 1] - range_n[current] + divide_rate - 1) / divide_rate; for (js = range_n[current], bufferside = 0; js < range_n[current + 1]; js += div_n, bufferside ++) { /* Apply kernel with local region of A and part of region of B */ @@ -508,7 +526,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Wait until all other threads are done with local region of B */ START_RPCC(); for (i = 0; i < args -> nthreads; i++) { - for (js = 0; js < DIVIDE_RATE; js++) { + for (js = 0; js < divide_rate; js++) { while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;}; } } @@ -570,8 +588,6 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock); #else static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER; - static pthread_cond_t level3_wakeup = PTHREAD_COND_INITIALIZER; - volatile static BLASLONG CPU_AVAILABLE = MAX_CPU_NUMBER; #endif blas_arg_t newarg; @@ -641,12 +657,6 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); #else pthread_mutex_lock(&level3_lock); - while(CPU_AVAILABLE < nthreads) { - pthread_cond_wait(&level3_wakeup, &level3_lock); - } - CPU_AVAILABLE -= nthreads; - WMB; - pthread_mutex_unlock(&level3_lock); #endif #ifdef USE_ALLOC_HEAP @@ -798,10 +808,6 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG #elif defined(OS_WINDOWS) LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock); #else - pthread_mutex_lock(&level3_lock); - CPU_AVAILABLE += nthreads; - WMB; - pthread_cond_signal(&level3_wakeup); pthread_mutex_unlock(&level3_lock); #endif diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index ebcc0aa781..a07a0baf99 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -13,6 +13,8 @@ if (USE_THREAD) set(BLAS_SERVER blas_server_omp.c) elseif (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") set(BLAS_SERVER blas_server_win32.c) + elseif (${CMAKE_SYSTEM_NAME} STREQUAL CYGWIN) + set(BLAS_SERVER blas_server_win32.c) elseif (${CMAKE_SYSTEM_NAME} STREQUAL "WindowsStore") set(BLAS_SERVER blas_server_win32.c) endif () @@ -52,6 +54,8 @@ if (DYNAMIC_ARCH) list(APPEND COMMON_SOURCES dynamic_arm64.c) elseif (POWER) list(APPEND COMMON_SOURCES dynamic_power.c) + elseif (ZARCH) + list(APPEND COMMON_SOURCES dynamic_zarch.c) elseif (RISCV64) list(APPEND COMMON_SOURCES dynamic_riscv64.c detect_riscv64.c) elseif (LOONGARCH64) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 4b79136ec7..ed6a850890 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -119,11 +119,11 @@ static void * blas_thread_buffer[MAX_CPU_NUMBER]; /* Local Variables */ #if defined(USE_PTHREAD_LOCK) -static pthread_mutex_t server_lock = PTHREAD_MUTEX_INITIALIZER; +volatile static pthread_mutex_t server_lock = PTHREAD_MUTEX_INITIALIZER; #elif defined(USE_PTHREAD_SPINLOCK) -static pthread_spinlock_t server_lock = 0; +volatile static pthread_spinlock_t server_lock = 0; #else -static unsigned long server_lock = 0; +volatile static unsigned long server_lock = 0; #endif #define THREAD_STATUS_SLEEP 2 @@ -637,9 +637,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ #ifdef SMP_SERVER // Handle lazy re-init of the thread-pool after a POSIX fork - LOCK_COMMAND(&server_lock); if (unlikely(blas_server_avail == 0)) blas_thread_init(); - UNLOCK_COMMAND(&server_lock); #endif BLASLONG i = 0; blas_queue_t *current = queue; diff --git a/driver/others/detect_riscv64.c b/driver/others/detect_riscv64.c index 5a5cc03916..dc3bb82099 100644 --- a/driver/others/detect_riscv64.c +++ b/driver/others/detect_riscv64.c @@ -63,12 +63,12 @@ uint64_t detect_riscv64_rvv100(void) * RVV 1.0 and we return 0. */ - asm volatile("vsetvli x0, x0, e8, m1, ta, ma\n\t" + asm volatile("vsetvli t0, x0, e8, m1, ta, ma\n\t" "csrr %0, vtype\n\t" "slt %0, x0, %0\n" : "=r" (rvv10_supported) : - :); + :"t0", "vtype"); return rvv10_supported; } diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 70b51f6fce..412057f9e2 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -128,6 +128,12 @@ extern gotoblas_t gotoblas_ARMV9SME; #else #define gotoblas_ARMV9SME gotoblas_ARMV8 #endif +#ifdef DYN_VORTEXM4 +extern gotoblas_t gotoblas_VORTEXM4; +#else +#error "dont have vortexm4" +#define gotoblas_VORTEXM4 gotoblas_ARMV8 +#endif #ifdef DYN_CORTEXA55 extern gotoblas_t gotoblas_CORTEXA55; #else @@ -155,17 +161,26 @@ extern gotoblas_t gotoblas_NEOVERSEV1; extern gotoblas_t gotoblas_NEOVERSEN2; extern gotoblas_t gotoblas_ARMV8SVE; extern gotoblas_t gotoblas_A64FX; -#ifndef NO_SME -extern gotoblas_t gotoblas_ARMV9SME; -#else -#define gotoblas_ARMV9SME gotoblas_ARMV8SVE -#endif #else #define gotoblas_NEOVERSEV1 gotoblas_ARMV8 #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 #define gotoblas_ARMV8SVE gotoblas_ARMV8 #define gotoblas_A64FX gotoblas_ARMV8 -#define gotoblas_ARMV9SME gotoblas_ARMV8 +#endif +#ifndef NO_SME +extern gotoblas_t gotoblas_ARMV9SME; +#if defined (__clang__) && defined(OS_DARWIN) +extern gotoblas_t gotoblas_VORTEXM4; +#else +#define gotoblas_VORTEXM4 gotoblas_NEOVERSEN1 +#endif +#else +#ifndef NO_SVE +#define gotoblas_ARMV9SME gotoblas_ARMV8SVE +#else +#define gotoblas_ARMV9SME gotoblas_NEOVERSEN1 +#endif +#define gotoblas_VORTEXM4 gotoblas_NEOVERSEN1 #endif extern gotoblas_t gotoblas_THUNDERX3T110; @@ -176,7 +191,7 @@ extern void openblas_warning(int verbose, const char * msg); #define FALLBACK_VERBOSE 1 #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" -#define NUM_CORETYPES 19 +#define NUM_CORETYPES 20 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -216,6 +231,7 @@ static char *corename[] = { "armv8sve", "a64fx", "armv9sme", + "vortexm4", "unknown" }; @@ -239,6 +255,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_ARMV8SVE) return corename[16]; if (gotoblas == &gotoblas_A64FX) return corename[17]; if (gotoblas == &gotoblas_ARMV9SME) return corename[18]; + if (gotoblas == &gotoblas_VORTEXM4) return corename[19]; return corename[NUM_CORETYPES]; } @@ -277,6 +294,7 @@ static gotoblas_t *force_coretype(char *coretype) { case 16: return (&gotoblas_ARMV8SVE); case 17: return (&gotoblas_A64FX); case 18: return (&gotoblas_ARMV9SME); + case 19: return (&gotoblas_VORTEXM4); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -288,11 +306,11 @@ static gotoblas_t *get_coretype(void) { char coremsg[128]; #if defined (OS_DARWIN) -//future #if !defined(NO_SME) -// if (support_sme1()) { -// return &gotoblas_ARMV9SME; -// } -// #endif +#if !defined(NO_SME) + if (support_sme1()) { + return &gotoblas_VORTEXM4; + } +#endif return &gotoblas_NEOVERSEN1; #endif @@ -463,7 +481,7 @@ static gotoblas_t *get_coretype(void) { } break; case 0x61: // Apple -//future if (support_sme1()) return &gotoblas_ARMV9SME; + if (support_sme1()) return &gotoblas_VORTEXM4; return &gotoblas_NEOVERSEN1; break; default: diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 4c1f4a26e0..34189869c5 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -69,7 +69,10 @@ static int cpuid(void) else if (arch == POWER_9) return CPU_POWER9; #endif #ifdef POWER_10 - else if (arch >= POWER_10) return CPU_POWER10; + else if (arch == POWER_10) return CPU_POWER10; +#endif +#ifdef POWER_11 + else if (arch >= POWER_11) return CPU_POWER10; #endif return CPU_UNKNOWN; } @@ -173,6 +176,13 @@ static struct { .cpu_type = CPU_POWER10, }, + { /* Power11 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x00820000, + .cpu_name = "POWER11 (raw)", + .cpu_type = CPU_POWER10, + }, + { /* End of table, pvr_mask and pvr_value must be zero */ .pvr_mask = 0x0, .pvr_value = 0x0, diff --git a/driver/others/dynamic_riscv64.c b/driver/others/dynamic_riscv64.c index 78e3bb67a4..726f76d790 100644 --- a/driver/others/dynamic_riscv64.c +++ b/driver/others/dynamic_riscv64.c @@ -97,6 +97,9 @@ struct riscv_hwprobe { #define RISCV_HWPROBE_KEY_IMA_EXT_0 4 #define RISCV_HWPROBE_IMA_V (1 << 2) +#define RISCV_HWPROBE_EXT_ZFH (1 << 27) +#define RISCV_HWPROBE_EXT_ZVFH (1 << 30) +#define RISCV_HWPROBE_EXT_ZVFBFWMA (1 << 54) #ifndef NR_riscv_hwprobe #ifndef NR_arch_specific_syscall @@ -147,6 +150,7 @@ char* gotoblas_corename(void) { } static gotoblas_t* get_coretype(void) { + uint64_t vector_mask; unsigned vlenb = 0; #if !defined(OS_LINUX) @@ -165,14 +169,31 @@ static gotoblas_t* get_coretype(void) { }; int ret = syscall(NR_riscv_hwprobe, pairs, 1, 0, NULL, 0); if (ret == 0) { - if (!(pairs[0].value & RISCV_HWPROBE_IMA_V)) +#if defined(BUILD_HFLOAT16) + vector_mask = (RISCV_HWPROBE_IMA_V | RISCV_HWPROBE_EXT_ZFH | RISCV_HWPROBE_EXT_ZVFH); +#elif defined(BUILD_BFLOAT16) + vector_mask = (RISCV_HWPROBE_IMA_V | RISCV_HWPROBE_EXT_ZVFBFWMA); +#else + vector_mask = RISCV_HWPROBE_IMA_V; +#endif + if ((pairs[0].value & vector_mask) != vector_mask) return NULL; } else { +#if defined(BUILD_HFLOAT16) + snprintf(coremsg, sizeof(coremsg), "Cpu support for Zfh+Zvfh extensions required due to BUILD_HFLOAT16=1\n"); + openblas_warning(1, coremsg); + return NULL; +#elif defined(BUILD_BFLOAT16) + snprintf(coremsg, sizeof(coremsg), "Cpu support for Zvfbfwma extensions required due to BUILD_BFLOAT16=1\n"); + openblas_warning(1, coremsg); + return NULL; +#else if (!(getauxval(AT_HWCAP) & DETECT_RISCV64_HWCAP_ISA_V)) return NULL; if (!detect_riscv64_rvv100()) return NULL; +#endif } /* diff --git a/driver/others/init.c b/driver/others/init.c index cd10e8d369..f27955184b 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -391,7 +391,15 @@ static void numa_mapping(void) { core = 0; for (cpu = 0; cpu < common -> num_procs; cpu ++) { bitmask_idx = CPUELT(cpu); +/* + * When common->avail[i] = 0x5555555555555555UL (indicating that adjacent logical cores share a physical core), + * using it as a mask may overlap with the local_cpu_map function's role, leading to only half of the real physical cores being detected. + */ +#ifdef ARCH_LOONGARCH64 + if (common -> node_info[node][bitmask_idx]) { +#else if (common -> node_info[node][bitmask_idx] & common -> avail[bitmask_idx] & CPUMASK(cpu)) { +#endif common -> cpu_info[count] = WRITE_CORE(core) | WRITE_NODE(node) | WRITE_CPU(cpu); count ++; core ++; @@ -930,8 +938,12 @@ void gotoblas_affinity_init(void) { if (common -> num_nodes > 1) numa_mapping(); +#ifdef ARCH_LOONGARCH64 + common -> final_num_procs = common -> num_procs; +#else common -> final_num_procs = 0; for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1; //Make the max cpu number. +#endif for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0; diff --git a/driver/others/memory.c b/driver/others/memory.c index c53e798bc1..c8415f348e 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2922,6 +2922,7 @@ void *blas_memory_alloc(int procpos){ blas_unlock(&memory[position].lock); #endif if (!memory[position].addr) { + int failcount = 0; do { #ifdef DEBUG printf("Allocation Start : %lx\n", base_address); @@ -2973,8 +2974,16 @@ void *blas_memory_alloc(int procpos){ #ifdef DEBUG printf(" Success -> %08lx\n", map_address); #endif - if (((BLASLONG) map_address) == -1) base_address = 0UL; - + if (((BLASLONG) map_address) == -1) { + base_address = 0UL; + failcount++; + if (failcount >10) { + fprintf(stderr, "OpenBLAS error: Memory allocation still failed after 10 retries, giving up.\n"); + exit(1); + } + } else { + failcount = 0; + } if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE; } while ((BLASLONG)map_address == -1); diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 597e5cac7e..9a1ff57358 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -67,6 +67,16 @@ BLASLONG sbgemm_p = DEFAULT_GEMM_P; #else BLASLONG sbgemm_p = SBGEMM_P; #endif +#if SHGEMM_P == shgemm_p +BLASLONG shgemm_p = DEFAULT_GEMM_P; +#else +BLASLONG shgemm_p = SHGEMM_P; +#endif +#if BGEMM_P == bgemm_p +BLASLONG bgemm_p = DEFAULT_GEMM_P; +#else +BLASLONG bgemm_p = BGEMM_P; +#endif #if SGEMM_P == sgemm_p BLASLONG sgemm_p = DEFAULT_GEMM_P; #else @@ -93,6 +103,16 @@ BLASLONG sbgemm_q = DEFAULT_GEMM_Q; #else BLASLONG sbgemm_q = SBGEMM_Q; #endif +#if SHGEMM_Q == shgemm_q +BLASLONG shgemm_q = DEFAULT_GEMM_Q; +#else +BLASLONG shgemm_q = SHGEMM_Q; +#endif +#if BGEMM_Q == bgemm_q +BLASLONG bgemm_q = DEFAULT_GEMM_Q; +#else +BLASLONG bgemm_q = BGEMM_Q; +#endif #if SGEMM_Q == sgemm_q BLASLONG sgemm_q = DEFAULT_GEMM_Q; #else @@ -119,6 +139,16 @@ BLASLONG sbgemm_r = DEFAULT_GEMM_R; #else BLASLONG sbgemm_r = SBGEMM_R; #endif +#if SHGEMM_R == shgemm_r +BLASLONG shgemm_r = DEFAULT_GEMM_R; +#else +BLASLONG shgemm_r = SHGEMM_R; +#endif +#if BGEMM_R == bgemm_r +BLASLONG bgemm_r = DEFAULT_GEMM_R; +#else +BLASLONG bgemm_r = BGEMM_R; +#endif #if SGEMM_R == sgemm_r BLASLONG sgemm_r = DEFAULT_GEMM_R; #else @@ -526,6 +556,10 @@ void blas_set_parameter(void){ #ifdef BUILD_BFLOAT16 sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; + bgemm_r = (((BUFFER_SIZE - ((BGEMM_P * BGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (BGEMM_Q * 4)) - 15) & ~15; +#endif +#ifdef BUILD_HFLOAT16 + shgemm_r = (((BUFFER_SIZE - ((SHGEMM_P * SHGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SHGEMM_Q * 4)) - 15) & ~15; #endif sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; @@ -619,6 +653,7 @@ void blas_set_parameter(void){ size = BITMASK(cpuid3, 16, 0xff); sbgemm_p = 192 * (size + 1); + shgemm_p = 192 * (size + 1); sgemm_p = 192 * (size + 1); dgemm_p = 96 * (size + 1); cgemm_p = 96 * (size + 1); @@ -634,6 +669,10 @@ void blas_set_parameter(void){ #ifdef BUILD_BFLOAT16 sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; + bgemm_r = (((BUFFER_SIZE - ((BGEMM_P * BGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (BGEMM_Q * 4)) - 15) & ~15; +#endif +#ifdef BUILD_HFLOAT16 + shgemm_r = (((BUFFER_SIZE - ((SHGEMM_P * SHGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SHGEMM_Q * 4)) - 15) & ~15; #endif sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; diff --git a/exports/Makefile b/exports/Makefile index 04fc64cfe0..176b1a7662 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -39,6 +39,9 @@ endif ifndef BUILD_BFLOAT16 BUILD_BFLOAT16 = 0 endif +ifndef BUILD_HFLOAT16 +BUILD_HFLOAT16 = 0 +endif ifndef BUILD_SINGLE BUILD_SINGLE = 0 endif @@ -130,10 +133,10 @@ dll : ../$(LIBDLLNAME) -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB) $(LIBPREFIX).def : $(GENSYM) - ./$(GENSYM) win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) + ./$(GENSYM) win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) libgoto_hpl.def : $(GENSYM) - ./$(GENSYM) win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) + ./$(GENSYM) win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) ifeq ($(OSNAME), Darwin) ifeq ($(FIXED_LIBNAME),1) @@ -298,23 +301,23 @@ static : ../$(LIBNAME) rm -f goto.$(SUFFIX) osx.def : $(GENSYM) ../Makefile.system ../getarch.c - ./$(GENSYM) osx $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) + ./$(GENSYM) osx $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) aix.def : $(GENSYM) ../Makefile.system ../getarch.c - ./$(GENSYM) aix $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) + ./$(GENSYM) aix $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) objcopy.def : $(GENSYM) ../Makefile.system ../getarch.c - ./$(GENSYM) objcopy $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) + ./$(GENSYM) objcopy $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) objconv.def : $(GENSYM) ../Makefile.system ../getarch.c - ./$(GENSYM) objconv $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) + ./$(GENSYM) objconv $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) test : linktest.c $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. rm -f linktest linktest.c : $(GENSYM) ../Makefile.system ../getarch.c - ./$(GENSYM) linktest $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c + ./$(GENSYM) linktest $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c clean :: @rm -f *.def *.dylib __.SYMDEF* *.renamed diff --git a/exports/gensymbol b/exports/gensymbol index f747dd091f..6284e6b6ae 100755 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -1,5 +1,33 @@ #!/bin/sh +############################################################################### +# Copyright (c) 2025, The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +############################################################################### + # Changelog # 2017/09/03 staticfloat # Added zsymv and csymv into @lapackobjs2 so they are properly renamed @@ -51,7 +79,8 @@ blasobjsz=" zgeadd dzsum zgemmt zgemmtr" blasobjs="lsame xerbla" -bfblasobjs="sbgemm sbgemmt sbgemmtr sbgemv sbdot sbstobf16 sbdtobf16 sbf16tos dbf16tod" +bfblasobjs="bgemm bgemv sbgemm sbgemmt sbgemmtr sbgemv sbdot sbstobf16 sbdtobf16 sbf16tos dbf16tod" +hfblasobjs="shgemm shgemv" cblasobjsc=" cblas_caxpy cblas_ccopy cblas_cdotc cblas_cdotu cblas_cgbmv cblas_cgemm cblas_cgemv cblas_cgerc cblas_cgeru cblas_chbmv cblas_chemm cblas_chemv cblas_cher2 cblas_cher2k @@ -100,6 +129,7 @@ cblasobjsz=" cblasobjs="cblas_xerbla" bfcblasobjs="cblas_sbgemm cblas_sbgemv cblas_sbdot cblas_sbstobf16 cblas_sbdtobf16 cblas_sbf16tos cblas_dbf16tod cblas_sbgemm_batch" +hfcblasobjs="cblas_shgemm" exblasobjs=" qamax qamin qasum qaxpy qcabs1 qcopy qdot qgbmv qgemm @@ -151,6 +181,7 @@ misc_no_underscore_objs=" goto_set_num_threads openblas_get_config openblas_get_corename + openblas_set_threads_callback_function " misc_underscore_objs="" @@ -3814,6 +3845,8 @@ shift p16=$9 shift p17=$9 +shift +p18=$9 if [ $p13 -eq 1 ]; then blasobjs="$blasobjs $bfblasobjs" @@ -3821,6 +3854,11 @@ if [ $p13 -eq 1 ]; then fi if [ $p14 -eq 1 ]; then + blasobjs="$blasobjs $hfblasobjs" + cblasobjs="$cblasobjs $hfcblasobjs" +fi + +if [ $p15 -eq 1 ]; then blasobjs="$blasobjs $blasobjss" cblasobjs="$cblasobjs $cblasobjss" lapackobjs="$lapackobjs $lapackobjss" @@ -3833,11 +3871,11 @@ if [ $p14 -eq 1 ]; then lapackeobjs="$lapackeobjs $lapackeobjss" fi -if [ $p15 -eq 1 ]; then +if [ $p16 -eq 1 ]; then blasobjs="$blasobjs $blasobjsd" cblasobjs="$cblasobjs $cblasobjsd" lapackobjs="$lapackobjs $lapackobjsd" - if [ $p14 -eq 0 ]; then + if [ $p15 -eq 0 ]; then lapackobjs2="$lapackobjs2 $lapackobjs2ds" fi lapackobjs2="$lapackobjs2 $lapackobjs2d $lapackobjs2dz" @@ -3847,14 +3885,14 @@ if [ $p15 -eq 1 ]; then lapackeobjs="$lapackeobjs $lapackeobjsd" fi -if [ $p16 -eq 1 ]; then +if [ $p17 -eq 1 ]; then blasobjs="$blasobjs $blasobjsc" cblasobjs="$cblasobjs $cblasobjsc" gemm3mobjs="$gemm3mobjs $gemm3mobjsc" cblasgemm3mobjs="$cblasgemm3mobjs $cblasgemm3mobjsc" lapackobjs="$lapackobjs $lapackobjsc" lapackobjs2="$lapackobjs2 $lapackobjs2c $lapackobjs2zc" - if [ $p14 -eq 0 ]; then + if [ $p15 -eq 0 ]; then lapackobjs2="$lapackobjs2 $lapackobjs2sc" fi lapack_deprecated_objs="$lapack_deprecated_objs $lapack_deprecated_objsc" @@ -3863,17 +3901,17 @@ if [ $p16 -eq 1 ]; then lapackeobjs="$lapackeobjs $lapackeobjsc" fi -if [ $p17 -eq 1 ]; then +if [ $p18 -eq 1 ]; then blasobjs="$blasobjs $blasobjsz" cblasobjs="$cblasobjs $cblasobjsz" gemm3mobjs="$gemm3mobjs $gemm3mobjsz" cblasgemm3mobjs="$cblasgemm3mobjs $cblasgemm3mobjsz" lapackobjs="$lapackobjs $lapackobjsz" lapackobjs2="$lapackobjs2 $lapackobjs2z" - if [ $p16 -eq 0 ]; then + if [ $p17 -eq 0 ]; then lapackobjs2="$lapackobjs2 $lapackobjs2zc" fi - if [ $p15 -eq 0 ]; then + if [ $p16 -eq 0 ]; then lapackobjs2="$lapackobjs2 $lapackobjs2dz" fi lapack_deprecated_objs="$lapack_deprecated_objs $lapack_deprecated_objsz" diff --git a/exports/gensymbol.pl b/exports/gensymbol.pl index 5597306343..85a6e9dbf5 100644 --- a/exports/gensymbol.pl +++ b/exports/gensymbol.pl @@ -1,5 +1,33 @@ #!/usr/bin/env perl +############################################################################### +# Copyright (c) 2025, The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +############################################################################### + # Changelog # 2017/09/03 staticfloat # Added zsymv and csymv into @lapackobjs2 so they are properly renamed @@ -51,7 +79,8 @@ zgeadd, dzsum, zgemmt,zgemmtr); @blasobjs = (lsame, xerbla); -@bfblasobjs = (sbgemm, sbgemmt, sbgemmtr, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); +@bfblasobjs = (bgemm, bgemv, sbgemm, sbgemmt, sbgemmtr, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); +@hfblasobjs = (shgemm, shgemv); @cblasobjsc = ( cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, @@ -97,7 +126,7 @@ @cblasobjs = ( cblas_xerbla ); @bfcblasobjs = (cblas_sbgemm, cblas_sbgemmt, cblas_sbgemmtr, cblas_sbgemv, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod, cblas_sbgemm_batch); - +@hfcblasobjs = (cblas_shgemm); @exblasobjs = ( qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, qgemv,qger,qmax,qmin, @@ -148,6 +177,7 @@ goto_set_num_threads, openblas_get_config, openblas_get_corename, + openblas_set_threads_callback_function, ); @misc_underscore_objs = ( @@ -3777,6 +3807,10 @@ @cblasobjs = (@cblasobjs, @bfcblasobjs); } if ($ARGV[13] == 1) { + @blasobjs = (@blasobjs, @hfblasobjs); + @cblasobjs = (@cblasobjs, @hfcblasobjs); +} +if ($ARGV[14] == 1) { @blasobjs = (@blasobjs, @blasobjss); @cblasobjs = (@cblasobjs, @cblasobjss); @lapackobjs = (@lapackobjs, @lapackobjss); @@ -3788,11 +3822,11 @@ @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_s); @lapackeobjs = (@lapackeobjs, @lapackeobjss); } -if ($ARGV[14] == 1) { +if ($ARGV[15] == 1) { @blasobjs = (@blasobjs, @blasobjsd); @cblasobjs = (@cblasobjs, @cblasobjsd); @lapackobjs = (@lapackobjs, @lapackobjsd); - if ($ARGV[13] == 0) { + if ($ARGV[14] == 0) { @lapackobjs2 = (@lapackobjs2, @lapackobjs2ds); } @lapackobjs2 = (@lapackobjs2, @lapackobjs2d, @lapackobjs2dz); @@ -3801,14 +3835,14 @@ @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_d); @lapackeobjs = (@lapackeobjs, @lapackeobjsd); } -if ($ARGV[15] == 1) { +if ($ARGV[16] == 1) { @blasobjs = (@blasobjs, @blasobjsc); @cblasobjs = (@cblasobjs, @cblasobjsc); @gemm3mobjs = (@gemm3mobjs, @gemm3mobjsc); @cblasgemm3mobjs = (@cblasgemm3mobjs, @cblasgemm3mobjsc); @lapackobjs = (@lapackobjs, @lapackobjsc); @lapackobjs2 = (@lapackobjs2, @lapackobjs2c, @lapackobjs2zc); - if ($ARGV[13] == 0) { + if ($ARGV[14] == 0) { @lapackobjs2 = (@lapackobjs2, @lapackobjs2sc); } @lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objsc); @@ -3816,17 +3850,17 @@ @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_c); @lapackeobjs = (@lapackeobjs, @lapackeobjsc); } -if ($ARGV[16] == 1) { +if ($ARGV[17] == 1) { @blasobjs = (@blasobjs, @blasobjsz); @cblasobjs = (@cblasobjs, @cblasobjsz); @gemm3mobjs = (@gemm3mobjs, @gemm3mobjsz); @cblasgemm3mobjs = (@cblasgemm3mobjs, @cblasgemm3mobjsz); @lapackobjs = (@lapackobjs, @lapackobjsz); @lapackobjs2 = (@lapackobjs2, @lapackobjs2z); - if ($ARGV[15] == 0) { + if ($ARGV[16] == 0) { @lapackobjs2 = (@lapackobjs2, @lapackobjs2zc); } - if ($ARGV[14] == 0) { + if ($ARGV[15] == 0) { @lapackobjs2 = (@lapackobjs2, @lapackobjs2dz); } @lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objsz); diff --git a/f_check b/f_check index 244f6bcae3..156f0e5621 100755 --- a/f_check +++ b/f_check @@ -30,9 +30,11 @@ nofortran=0 shift 2 compiler="$*" compiler_bin="$1" +shift +compiler_args="$*" # f77 is too ambiguous -[ "$compiler" = "f77" ] && compiler='' +[ "$compiler_bin" = "f77" ] && compiler='' path=`split "$PATH" ':'` @@ -50,7 +52,7 @@ if [ -z "$compiler" ]; then for list in $lists; do for p in $path; do if [ -x "$p/$list" ]; then - compiler=$list + compiler="$list $compiler_args" compiler_bin=$list break 2 fi @@ -90,7 +92,7 @@ else vendor=FLANG openmp='-fopenmp' ;; - *GNU*|*GCC*) + *GCC*) v="${data#*GCC: *\) }" v="${v%%\"*}" diff --git a/getarch.c b/getarch.c index b51c3ed643..8b70248098 100644 --- a/getarch.c +++ b/getarch.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011-2014, The OpenBLAS Project +Copyright (c) 2011-2014, 2025 The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -158,6 +158,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_CSKY */ /* #define FORCE_CK860FV */ /* #define FORCE_GENERIC */ +/* #define FORCE_AMPERE1 */ #ifdef FORCE_P2 #define FORCE @@ -835,7 +836,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "POWER9" #endif -#if defined(FORCE_POWER10) +#if defined(FORCE_POWER10) || (FORCE_POWER11) #define FORCE #define ARCHITECTURE "POWER" #define SUBARCHITECTURE "POWER10" @@ -1475,7 +1476,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \ - "-march=armv8.4-a+sve -mtune=neoverse-v1" + "-march=armv8.4-a+sve+bf16 -mtune=neoverse-v1" #define LIBNAME "neoversev1" #define CORENAME "NEOVERSEV1" #endif @@ -1497,6 +1498,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "NEOVERSEN2" #endif +#ifdef FORCE_NEOVERSEV2 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "NEOVERSEV2" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DNEOVERSEV2 " \ + "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \ + "-mcpu=neoverse-v2" +#define LIBNAME "neoversev2" +#define CORENAME "NEOVERSEV2" +#endif + #ifdef FORCE_CORTEXA55 #define FORCE #define ARCHITECTURE "ARM64" @@ -1590,6 +1607,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "EMAG8180" #endif +#ifdef FORCE_AMPERE1 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "AMPERE1" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DAMPERE1 " \ + "-DL1_CODE_SIZE=16384 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ + "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8 " \ + "-march=armv8.6-a+crypto+crc+fp16+sha3+rng" +#define LIBNAME "ampere1" +#define CORENAME "AMPERE1" +#endif + #ifdef FORCE_THUNDERX3T110 #define ARMV8 #define FORCE @@ -1621,6 +1654,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "VORTEX" #endif +#ifdef FORCE_VORTEXM4 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "VORTEXM4" +#define SUBDIRNAME "arm64" +#ifdef __clang__ +#define ARCHCONFIG "-DVORTEXM4 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SME -DARMV8" +#else +#define ARCHCONFIG "-DVORTEX " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#endif +#define LIBNAME "vortexm4" +#define CORENAME "VORTEXM4" +#endif + #ifdef FORCE_A64FX #define ARMV8 #define FORCE @@ -1820,7 +1875,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "CK860FV" #endif - #ifndef FORCE #ifdef USER_TARGET @@ -2014,10 +2068,9 @@ int main(int argc, char *argv[]){ #endif -#ifdef INTEL_AMD -#ifndef FORCE +#if defined(INTEL_AMD) && !defined(FORCE) get_sse(); -#else +#elif defined(FORCE_INTEL) sprintf(buffer, "%s", ARCHCONFIG); @@ -2047,7 +2100,6 @@ int main(int argc, char *argv[]){ } else p ++; } #endif -#endif #if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ printf("__BYTE_ORDER__=__ORDER_BIG_ENDIAN__\n"); diff --git a/getarch_2nd.c b/getarch_2nd.c index dd1f830895..2085556bd6 100644 --- a/getarch_2nd.c +++ b/getarch_2nd.c @@ -1,3 +1,31 @@ +/*************************************************************************** + * Copyright (c) 2025, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + #include #ifndef BUILD_KERNEL #include "config.h" @@ -17,8 +45,14 @@ typedef unsigned long BLASULONG; int main(int argc, char **argv) { if ( (argc <= 1) || ((argc >= 2) && (*argv[1] == '0'))) { + printf("BGEMM_UNROLL_M=%d\n", BGEMM_DEFAULT_UNROLL_M); + printf("BGEMM_UNROLL_N=%d\n", BGEMM_DEFAULT_UNROLL_N); + printf("BGEMM_UNROLL_M=%d\n", BGEMM_DEFAULT_UNROLL_M); + printf("BGEMM_UNROLL_N=%d\n", BGEMM_DEFAULT_UNROLL_N); printf("SBGEMM_UNROLL_M=%d\n", SBGEMM_DEFAULT_UNROLL_M); printf("SBGEMM_UNROLL_N=%d\n", SBGEMM_DEFAULT_UNROLL_N); + printf("SHGEMM_UNROLL_M=%d\n", SHGEMM_DEFAULT_UNROLL_M); + printf("SHGEMM_UNROLL_N=%d\n", SHGEMM_DEFAULT_UNROLL_N); printf("SGEMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M); printf("SGEMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N); printf("DGEMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M); diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index eb2bce3f05..ee7d40d382 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -1,3 +1,30 @@ +############################################################################### +# Copyright (c) 2025, The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +############################################################################### include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_BINARY_DIR}) @@ -97,10 +124,9 @@ foreach (CBLAS_FLAG ${CBLAS_FLAGS}) #sdsdot, dsdot if (BUILD_SINGLE OR BUILD_DOUBLE) GenerateNamedObjects("sdsdot.c" "" "sdsdot" ${CBLAS_FLAG} "" "" true "SINGLE") - if(CBLAS_FLAG EQUAL 1) - GenerateNamedObjects("gemm_batch.c" "" "gemm_batch" ${CBLAS_FLAG} "" "" false) -endif () -endif () + GenerateNamedObjects("gemm_batch.c" "" "gemm_batch" ${CBLAS_FLAG} "" "" false) + GenerateNamedObjects("gemm_batch_strided.c" "" "gemm_batch_strided" ${CBLAS_FLAG} "" "" false) + endif () if (BUILD_DOUBLE) GenerateNamedObjects("dsdot.c" "" "dsdot" ${CBLAS_FLAG} "" "" true "SINGLE") endif () @@ -123,18 +149,24 @@ endif () GenerateNamedObjects("imax.c" "USE_MIN" "i*min" ${CBLAS_FLAG}) if (BUILD_BFLOAT16) + GenerateNamedObjects("scal.c" "BGEMM" "bscal" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("gemm.c" "BGEMM" "bgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("sbgemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("sbgemmt.c" "RNAME" "sbgemmtr" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("sbgemv.c" "BGEMM" "bgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("bf16to.c" "SINGLE_PREC" "sbf16tos" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("bf16to.c" "DOUBLE_PREC" "dbf16tod" ${CBLAS_FLAG} "" "" true "BFLOAT16") - if(CBLAS_FLAG EQUAL 1) GenerateNamedObjects("gemm_batch.c" "" "sbgemm_batch" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("gemm_batch_strided.c" "" "sbgemm_batch_strided" ${CBLAS_FLAG} "" "" true "BFLOAT16") endif () +if (BUILD_HFLOAT16) + GenerateNamedObjects("gemm.c" "" "shgemm" ${CBLAS_FLAG} "" "" true "HFLOAT16") + GenerateNamedObjects("sbgemv.c" "" "shgemv" ${CBLAS_FLAG} "" "" true "HFLOAT16") endif () # complex-specific sources @@ -164,9 +196,8 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("max.c" "USE_ABS" "scamax" ${CBLAS_FLAG} "" "" true "COMPLEX") GenerateNamedObjects("asum.c" "" "scasum" ${CBLAS_FLAG} "" "" true "COMPLEX") GenerateNamedObjects("sum.c" "" "scsum" ${CBLAS_FLAG} "" "" true "COMPLEX") - if(CBLAS_FLAG EQUAL 1) - GenerateNamedObjects("gemm_batch.c" "" "cgemm_batch" ${CBLAS_FLAG} "" "" true "COMPLEX") - endif () + GenerateNamedObjects("gemm_batch.c" "" "cgemm_batch" ${CBLAS_FLAG} "" "" true "COMPLEX") + GenerateNamedObjects("gemm_batch_strided.c" "" "cgemm_batch_strided" ${CBLAS_FLAG} "" "" true "COMPLEX") endif () if (${float_type} STREQUAL "ZCOMPLEX") GenerateNamedObjects("zscal.c" "SSCAL" "dscal" ${CBLAS_FLAG} "" "" false "ZCOMPLEX") @@ -176,9 +207,8 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("max.c" "USE_ABS" "dzamax" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") GenerateNamedObjects("asum.c" "" "dzasum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") GenerateNamedObjects("sum.c" "" "dzsum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") - if(CBLAS_FLAG EQUAL 1) - GenerateNamedObjects("gemm_batch.c" "" "zgemm_batch" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") - endif () + GenerateNamedObjects("gemm_batch.c" "" "zgemm_batch" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") + GenerateNamedObjects("gemm_batch_strided.c" "" "zgemm_batch_strided" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") endif () endforeach () @@ -218,6 +248,7 @@ if (NOT NO_LAPACK) GenerateNamedObjects("lapack/lauu2.c" "" "" 0 "" "" 0 3) GenerateNamedObjects("lapack/trti2.c" "" "" 0 "" "" 0 3) endif() + GenerateNamedObjects("lapack/laed3.c" "" "" 0 "" "" 0 1) endif () if ( BUILD_COMPLEX AND NOT BUILD_SINGLE) @@ -228,7 +259,8 @@ if ( BUILD_COMPLEX AND NOT BUILD_SINGLE) GenerateNamedObjects("nrm2.c" "" "nrm2" 0 "" "" false "SINGLE") GenerateNamedObjects("gemv.c" "" "gemv" 0 "" "" false "SINGLE") GenerateNamedObjects("gemm.c" "" "gemm" 0 "" "" false "SINGLE") - GenerateNamedObjects("gemm_batch.c" "" "gemm_batch" 1 "" "" false "SINGLE") + GenerateNamedObjects("gemm_batch.c" "" "gemm_batch" 0 "" "" false "SINGLE") + GenerateNamedObjects("gemm_batch_strided.c" "" "gemm_batch_strided" 0 "" "" false "SINGLE") GenerateNamedObjects("asum.c" "" "asum" 0 "" "" false "SINGLE") GenerateNamedObjects("swap.c" "" "swap" 0 "" "" false "SINGLE") GenerateNamedObjects("axpy.c" "" "axpy" 0 "" "" false "SINGLE") @@ -242,13 +274,19 @@ if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) GenerateNamedObjects("nrm2.c" "" "nrm2" 0 "" "" false "DOUBLE") GenerateNamedObjects("gemv.c" "" "gemv" 0 "" "" false "DOUBLE") GenerateNamedObjects("gemm.c" "" "gemm" 0 "" "" false "DOUBLE") - GenerateNamedObjects("gemm_batch.c" "" "gemm_batch" 1 "" "" false "DOUBLE") + GenerateNamedObjects("gemm_batch.c" "" "gemm_batch" 0 "" "" false "DOUBLE") + GenerateNamedObjects("gemm_batch_strided.c" "" "gemm_batch_strided" 0 "" "" false "DOUBLE") GenerateNamedObjects("asum.c" "" "asum" 0 "" "" false "DOUBLE") GenerateNamedObjects("swap.c" "" "swap" 0 "" "" false "DOUBLE") GenerateNamedObjects("axpy.c" "" "axpy" 0 "" "" false "DOUBLE") GenerateNamedObjects("imax.c" "USE_ABS" "i*amax" 0 "" "" false "DOUBLE") endif () +if ( BUILD_BFLOAT16 AND NO_FBLAS ) + GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" 0 "" "" true "BFLOAT16") + GenerateNamedObjects("bf16to.c" "SINGLE_PREC" "sbf16tos" 0 "" "" true "BFLOAT16") +endif() + add_library(interface OBJECT ${OPENBLAS_SRC}) if (USE_OPENMP) diff --git a/interface/Makefile b/interface/Makefile index f09a6f46b9..83a894b125 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -1,3 +1,31 @@ +############################################################################### +# Copyright (c) 2025, The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +############################################################################### + TOPDIR = .. include $(TOPDIR)/Makefile.system @@ -44,15 +72,24 @@ SBLAS3OBJS = \ sgemm.$(SUFFIX) ssymm.$(SUFFIX) strmm.$(SUFFIX) \ strsm.$(SUFFIX) ssyrk.$(SUFFIX) ssyr2k.$(SUFFIX) \ somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\ - sgeadd.$(SUFFIX) sgemmt.$(SUFFIX) sgemmtr.$(SUFFIX) + sgeadd.$(SUFFIX) sgemmt.$(SUFFIX) sgemmtr.$(SUFFIX) \ + sgemm_batch.$(SUFFIX) sgemm_batch_strided.$(SUFFIX) ifeq ($(BUILD_BFLOAT16),1) +BBLAS3OBJS = bgemm.$(SUFFIX) +BBLAS2OBJS = bgemv.$(SUFFIX) +BBLAS1OBJS = bscal.$(SUFFIX) SBBLAS1OBJS = sbdot.$(SUFFIX) SBBLAS2OBJS = sbgemv.$(SUFFIX) -SBBLAS3OBJS = sbgemm.$(SUFFIX) sbgemmt.$(SUFFIX) sbgemmtr.$(SUFFIX) +SBBLAS3OBJS = sbgemm.$(SUFFIX) sbgemmt.$(SUFFIX) sbgemmtr.$(SUFFIX) sbgemm_batch.$(SUFFIX) sbgemm_batch_strided.$(SUFFIX) SBEXTOBJS = sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX) endif +ifeq ($(BUILD_HFLOAT16),1) +SHBLAS3OBJS = shgemm.$(SUFFIX) +SHBLAS2OBJS = shgemv.$(SUFFIX) +endif + DBLAS1OBJS = \ daxpy.$(SUFFIX) dswap.$(SUFFIX) \ dcopy.$(SUFFIX) dscal.$(SUFFIX) \ @@ -76,7 +113,8 @@ DBLAS3OBJS = \ dgemm.$(SUFFIX) dsymm.$(SUFFIX) dtrmm.$(SUFFIX) \ dtrsm.$(SUFFIX) dsyrk.$(SUFFIX) dsyr2k.$(SUFFIX) \ domatcopy.$(SUFFIX) dimatcopy.$(SUFFIX)\ - dgeadd.$(SUFFIX) dgemmt.$(SUFFIX) dgemmtr.$(SUFFIX) + dgeadd.$(SUFFIX) dgemmt.$(SUFFIX) dgemmtr.$(SUFFIX) \ + dgemm_batch.$(SUFFIX) dgemm_batch_strided.$(SUFFIX) CBLAS1OBJS = \ caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \ @@ -105,7 +143,8 @@ CBLAS3OBJS = \ ctrsm.$(SUFFIX) csyrk.$(SUFFIX) csyr2k.$(SUFFIX) \ chemm.$(SUFFIX) cherk.$(SUFFIX) cher2k.$(SUFFIX) \ comatcopy.$(SUFFIX) cimatcopy.$(SUFFIX)\ - cgeadd.$(SUFFIX) cgemmt.$(SUFFIX) cgemmtr.$(SUFFIX) + cgeadd.$(SUFFIX) cgemmt.$(SUFFIX) cgemmtr.$(SUFFIX) \ + cgemm_batch.$(SUFFIX) cgemm_batch_strided.$(SUFFIX) ZBLAS1OBJS = \ zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \ @@ -134,7 +173,8 @@ ZBLAS3OBJS = \ ztrsm.$(SUFFIX) zsyrk.$(SUFFIX) zsyr2k.$(SUFFIX) \ zhemm.$(SUFFIX) zherk.$(SUFFIX) zher2k.$(SUFFIX) \ zomatcopy.$(SUFFIX) zimatcopy.$(SUFFIX)\ - zgeadd.$(SUFFIX) zgemmt.$(SUFFIX) zgemmtr.$(SUFFIX) + zgeadd.$(SUFFIX) zgemmt.$(SUFFIX) zgemmtr.$(SUFFIX) \ + zgemm_batch.$(SUFFIX) zgemm_batch_strided.$(SUFFIX) ifeq ($(SUPPORT_GEMM3M), 1) @@ -282,13 +322,24 @@ CSBLAS2OBJS = \ CSBLAS3OBJS = \ cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \ cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\ - cblas_sgeadd.$(SUFFIX) cblas_sgemmt.$(SUFFIX) cblas_sgemmtr.$(SUFFIX) cblas_sgemm_batch.$(SUFFIX) + cblas_sgeadd.$(SUFFIX) cblas_sgemmt.$(SUFFIX) cblas_sgemmtr.$(SUFFIX) cblas_sgemm_batch.$(SUFFIX) cblas_sgemm_batch_strided.$(SUFFIX) ifeq ($(BUILD_BFLOAT16),1) +CBBLAS3OBJS = cblas_bgemm.$(SUFFIX) +CBBLAS2OBJS = cblas_bgemv.$(SUFFIX) +CBBLAS1OBJS = cblas_bscal.$(SUFFIX) CSBBLAS1OBJS = cblas_sbdot.$(SUFFIX) CSBBLAS2OBJS = cblas_sbgemv.$(SUFFIX) -CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) cblas_sbgemmt.$(SUFFIX) cblas_sbgemmtr.$(SUFFIX) cblas_sbgemm_batch.$(SUFFIX) +CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) cblas_sbgemmt.$(SUFFIX) cblas_sbgemmtr.$(SUFFIX) cblas_sbgemm_batch.$(SUFFIX) cblas_sbgemm_batch_strided.$(SUFFIX) CSBEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) +ifeq ($(ONLY_CBLAS),1) +CSBEXTOBJS += sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX) +endif +endif + +ifeq ($(BUILD_HFLOAT16),1) +CSHBLAS3OBJS = cblas_shgemm.$(SUFFIX) +CSHBLAS2OBJS = cblas_shgemv.$(SUFFIX) endif CDBLAS1OBJS = \ @@ -308,7 +359,7 @@ CDBLAS2OBJS = \ CDBLAS3OBJS += \ cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \ cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX) \ - cblas_dgeadd.$(SUFFIX) cblas_dgemmt.$(SUFFIX) cblas_dgemmtr.$(SUFFIX) cblas_dgemm_batch.$(SUFFIX) + cblas_dgeadd.$(SUFFIX) cblas_dgemmt.$(SUFFIX) cblas_dgemmtr.$(SUFFIX) cblas_dgemm_batch.$(SUFFIX) cblas_dgemm_batch_strided.$(SUFFIX) CCBLAS1OBJS = \ cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ @@ -333,7 +384,7 @@ CCBLAS3OBJS = \ cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \ cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \ cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)\ - cblas_cgeadd.$(SUFFIX) cblas_cgemmt.$(SUFFIX) cblas_cgemmtr.$(SUFFIX) cblas_cgemm_batch.$(SUFFIX) + cblas_cgeadd.$(SUFFIX) cblas_cgemmt.$(SUFFIX) cblas_cgemmtr.$(SUFFIX) cblas_cgemm_batch.$(SUFFIX) cblas_cgemm_batch_strided.$(SUFFIX) CXERBLAOBJ = \ cblas_xerbla.$(SUFFIX) @@ -364,7 +415,7 @@ CZBLAS3OBJS = \ cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \ cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX)\ cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX) \ - cblas_zgeadd.$(SUFFIX) cblas_zgemmt.$(SUFFIX) cblas_zgemmtr.$(SUFFIX) cblas_zgemm_batch.$(SUFFIX) + cblas_zgeadd.$(SUFFIX) cblas_zgemmt.$(SUFFIX) cblas_zgemmtr.$(SUFFIX) cblas_zgemm_batch.$(SUFFIX) cblas_zgemm_batch_strided.$(SUFFIX) ifeq ($(SUPPORT_GEMM3M), 1) @@ -385,9 +436,14 @@ override CFLAGS += -I. SBLAS1OBJS += $(CSBLAS1OBJS) SBLAS2OBJS += $(CSBLAS2OBJS) SBLAS3OBJS += $(CSBLAS3OBJS) +BBLAS3OBJS += $(CBBLAS3OBJS) +BBLAS2OBJS += $(CBBLAS2OBJS) +BBLAS1OBJS += $(CBBLAS1OBJS) SBBLAS1OBJS += $(CSBBLAS1OBJS) SBBLAS2OBJS += $(CSBBLAS2OBJS) SBBLAS3OBJS += $(CSBBLAS3OBJS) +SHBLAS3OBJS += $(CSHBLAS3OBJS) +SHBLAS2OBJS += $(CSHBLAS2OBJS) DBLAS1OBJS += $(CDBLAS1OBJS) DBLAS2OBJS += $(CDBLAS2OBJS) DBLAS3OBJS += $(CDBLAS3OBJS) @@ -403,8 +459,10 @@ SBEXTOBJS += $(CSBEXTOBJS) CBAUXOBJS += $(CXERBLAOBJ) endif +BBLASOBJS = $(BBLAS3OBJS) $(BBLAS2OBJS) $(BBLAS1OBJS) SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS) SBBLASOBJS = $(SBBLAS1OBJS) $(SBBLAS2OBJS) $(SBBLAS3OBJS) +SHBLASOBJS = $(SHBLAS3OBJS) $(SHBLAS2OBJS) DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS) QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS) CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS) @@ -419,8 +477,8 @@ XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS) SLAPACKOBJS = \ sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \ spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) slauu2.$(SUFFIX) \ - slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) strtrs.$(SUFFIX) - + slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) strtrs.$(SUFFIX) \ + slaed3.$(SUFFIX) #DLAPACKOBJS = \ # dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \ @@ -430,8 +488,8 @@ SLAPACKOBJS = \ DLAPACKOBJS = \ dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \ dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) dlauu2.$(SUFFIX) \ - dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) dtrtrs.$(SUFFIX) - + dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) dtrtrs.$(SUFFIX) \ + dlaed3.$(SUFFIX) QLAPACKOBJS = \ qgetf2.$(SUFFIX) qgetrf.$(SUFFIX) qlauu2.$(SUFFIX) qlauum.$(SUFFIX) \ @@ -512,7 +570,7 @@ ifneq ($(BUILD_COMPLEX16),1) ZBLASOBJS= endif -FUNCOBJS = $(SBEXTOBJS) $(CXERBLAOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) +FUNCOBJS = $(SBEXTOBJS) $(CXERBLAOBJS) $(BBLASOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) $(SHBLASOBJS) ifeq ($(EXPRECISION), 1) FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) @@ -547,10 +605,10 @@ clean :: level1 : $(SBEXTOBJS) $(SBBLAS1OBJS) $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $(XBLAS1OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ -level2 : $(SBBLAS2OBJS) $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) +level2 : $(SBBLAS2OBJS) $(BBLAS2OBJS) $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) $(SHBLAS2OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ -level3 : $(SBBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) +level3 : $(SBBLAS3OBJS) $(BBLAS3OBJ) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) $(SHBLAS3OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ aux : $(CBAUXOBJS) @@ -782,6 +840,8 @@ dsdot.$(SUFFIX) dsdot.$(PSUFFIX) : dsdot.c $(CC) $(CFLAGS) -c $< -o $(@F) ifeq ($(BUILD_BFLOAT16),1) +bscal.$(SUFFIX) bscal.$(PSUFFIX) : scal.c + $(CC) $(CFLAGS) -DBGEMM -c $< -o $(@F) sbdot.$(SUFFIX) sbdot.$(PSUFFIX) : bf16dot.c $(CC) $(CFLAGS) -c $< -o $(@F) sbstobf16.$(SUFFIX) sbstobf16.$(PSUFFIX) : tobf16.c @@ -939,10 +999,17 @@ xgerc.$(SUFFIX) xgerc.$(PSUFFIX) : zger.c $(CC) -c $(CFLAGS) -DCONJ $< -o $(@F) ifeq ($(BUILD_BFLOAT16),1) +bgemv.$(SUFFIX) bgemv.$(PSUFFIX) : sbgemv.c + $(CC) $(CFLAGS) -DBGEMM -c $< -o $(@F) sbgemv.$(SUFFIX) sbgemv.$(PSUFFIX) : sbgemv.c $(CC) $(CFLAGS) -c $< -o $(@F) endif +ifeq ($(BUILD_HFLOAT16),1) +shgemv.$(SUFFIX) shgemv.$(PSUFFIX) : sbgemv.c + $(CC) $(CFLAGS) -c $< -o $(@F) +endif + ifndef USE_NETLIB_GEMV sgemv.$(SUFFIX) sgemv.$(PSUFFIX): gemv.c $(CC) -c $(CFLAGS) -o $(@F) $< @@ -1301,6 +1368,8 @@ xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c $(CC) -c $(CFLAGS) $< -o $(@F) ifeq ($(BUILD_BFLOAT16),1) +bgemm.$(SUFFIX) bgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h @@ -1309,6 +1378,11 @@ sbgemmtr.$(SUFFIX) sbgemmtr.$(PSUFFIX) : sbgemmt.c ../param.h $(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) endif +ifeq ($(BUILD_HFLOAT16),1) +shgemm.$(SUFFIX) shgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) +endif + sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) @@ -1604,6 +1678,8 @@ cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) ifeq ($(BUILD_BFLOAT16),1) +cblas_bscal.$(SUFFIX) cblas_bscal.$(PSUFFIX) : scal.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_sbdot.$(SUFFIX) cblas_sbdot.$(PSUFFIX) : bf16dot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_sbstobf16.$(SUFFIX) cblas_sbstobf16.$(PSUFFIX) : tobf16.c @@ -1758,10 +1834,17 @@ cblas_zdrot.$(SUFFIX) cblas_zdrot.$(PSUFFIX) : zrot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) ifeq ($(BUILD_BFLOAT16),1) +cblas_bgemv.$(SUFFIX) cblas_bgemv.$(PSUFFIX) : sbgemv.c + $(CC) -DCBLAS -DBGEMM -c $(CFLAGS) $< -o $(@F) cblas_sbgemv.$(SUFFIX) cblas_sbgemv.$(PSUFFIX) : sbgemv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) endif +ifeq ($(BUILD_HFLOAT16),1) +cblas_shgemv.$(SUFFIX) cblas_shgemv.$(PSUFFIX) : sbgemv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) +endif + cblas_sgemv.$(SUFFIX) cblas_sgemv.$(PSUFFIX): gemv.c $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< @@ -1964,10 +2047,17 @@ cblas_sgemm.$(SUFFIX) cblas_sgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) ifeq ($(BUILD_BFLOAT16),1) +cblas_bgemm.$(SUFFIX) cblas_bgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -DCBLAS -DBGEMM -c $(CFLAGS) $< -o $(@F) cblas_sbgemm.$(SUFFIX) cblas_sbgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) endif +ifeq ($(BUILD_HFLOAT16),1) +cblas_shgemm.$(SUFFIX) cblas_shgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) +endif + cblas_dgemm.$(SUFFIX) cblas_dgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) @@ -2345,6 +2435,11 @@ zlarf.$(SUFFIX) zlarf.$(PSUFFIX) : larf.c xlarf.$(SUFFIX) xlarf.$(PSUFFIX) : larf.c $(CC) -c $(CFLAGS) $< -o $(@F) +slaed3.$(SUFFIX) slaed3.$(PSUFFIX) : lapack/laed3.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dlaed3.$(SUFFIX) dlaed3.$(PSUFFIX) : lapack/laed3.c + $(CC) -c $(CFLAGS) $< -o $(@F) ############# BLAS EXTENSIONS ##################################### @@ -2461,3 +2556,48 @@ cblas_cgemm_batch.$(SUFFIX) cblas_cgemm_batch.$(PSUFFIX) : gemm_batch.c ../param cblas_zgemm_batch.$(SUFFIX) cblas_zgemm_batch.$(PSUFFIX) : gemm_batch.c ../param.h $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +cblas_sbgemm_batch_strided.$(SUFFIX) cblas_sbgemm_batch_strided.$(PSUFFIX) : gemm_batch_strided.c ../param.h + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +cblas_sgemm_batch_strided.$(SUFFIX) cblas_sgemm_batch_strided.$(PSUFFIX) : gemm_batch_strided.c ../param.h + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +cblas_dgemm_batch_strided.$(SUFFIX) cblas_dgemm_batch_strided.$(PSUFFIX) : gemm_batch_strided.c ../param.h + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +cblas_cgemm_batch_strided.$(SUFFIX) cblas_cgemm_batch_strided.$(PSUFFIX) : gemm_batch_strided.c ../param.h + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +cblas_zgemm_batch_strided.$(SUFFIX) cblas_zgemm_batch_strided.$(PSUFFIX) : gemm_batch_strided.c ../param.h + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +sbgemm_batch.$(SUFFIX) sbgemm_batch.$(PSUFFIX) : gemm_batch.c ../param.h + $(CC) -c $(CFLAGS) -UCBLAS $< -o $(@F) + +sgemm_batch.$(SUFFIX) sgemm_batch.$(PSUFFIX) : gemm_batch.c ../param.h + $(CC) -c $(CFLAGS) -UCBLAS $< -o $(@F) + +dgemm_batch.$(SUFFIX) dgemm_batch.$(PSUFFIX) : gemm_batch.c ../param.h + $(CC) -c $(CFLAGS) -UCBLAS $< -o $(@F) + +cgemm_batch.$(SUFFIX) cgemm_batch.$(PSUFFIX) : gemm_batch.c ../param.h + $(CC) -c $(CFLAGS) -UCBLAS $< -o $(@F) + +zgemm_batch.$(SUFFIX) zgemm_batch.$(PSUFFIX) : gemm_batch.c ../param.h + $(CC) -c $(CFLAGS) -UCBLAS $< -o $(@F) + +sbgemm_batch_strided.$(SUFFIX) sbgemm_batch_strided.$(PSUFFIX) : gemm_batch_strided.c ../param.h + $(CC) -c $(CFLAGS) -UCBLAS $< -o $(@F) + +sgemm_batch_strided.$(SUFFIX) sgemm_batch_strided.$(PSUFFIX) : gemm_batch_strided.c ../param.h + $(CC) -c $(CFLAGS) -UCBLAS $< -o $(@F) + +dgemm_batch_strided.$(SUFFIX) dgemm_batch_strided.$(PSUFFIX) : gemm_batch_strided.c ../param.h + $(CC) -c $(CFLAGS) -UCBLAS $< -o $(@F) + +cgemm_batch_strided.$(SUFFIX) cgemm_batch_strided.$(PSUFFIX) : gemm_batch_strided.c ../param.h + $(CC) -c $(CFLAGS) -UCBLAS $< -o $(@F) + +zgemm_batch_strided.$(SUFFIX) zgemm_batch_strided.$(PSUFFIX) : gemm_batch_strided.c ../param.h + $(CC) -c $(CFLAGS) -UCBLAS $< -o $(@F) diff --git a/interface/gemm.c b/interface/gemm.c index 54e5604fd3..4b1c9a93cc 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -54,8 +54,15 @@ #define ERROR_NAME "DGEMM " #define GEMV BLASFUNC(dgemv) #elif defined(BFLOAT16) +#ifdef BGEMM +#define ERROR_NAME "BGEMM " +#define GEMV BLASFUNC(bgemv) +#else #define ERROR_NAME "SBGEMM " #define GEMV BLASFUNC(sbgemv) +#endif +#elif defined(HFLOAT16) +#define ERROR_NAME "SHGEMM " #else #define ERROR_NAME "SGEMM " #define GEMV BLASFUNC(sgemv) @@ -111,7 +118,7 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B #endif }; -#if defined(SMALL_MATRIX_OPT) && !defined(GEMM3M) && !defined(XDOUBLE) +#if defined(SMALL_MATRIX_OPT) && !defined(GEMM3M) && !defined(XDOUBLE) && !defined(HFLOAT16) && !defined(BGEMM) #define USE_SMALL_MATRIX_OPT 1 #else #define USE_SMALL_MATRIX_OPT 0 @@ -219,11 +226,11 @@ static inline int get_gemm_optimal_nthreads_neoversev2(double MNK, int ncpu) { static inline int get_gemm_optimal_nthreads(double MNK) { int ncpu = num_cpu_avail(3); -#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) +#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16) return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); -#elif defined(NEOVERSEV2) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) +#elif defined(NEOVERSEV2) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16) return get_gemm_optimal_nthreads_neoversev2(MNK, ncpu); -#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) +#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16) if (strcmp(gotoblas_corename(), "neoversev1") == 0) { return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); } @@ -259,6 +266,7 @@ void NAME(char *TRANSA, char *TRANSB, int transa, transb, nrowa, nrowb; blasint info; + int order = -1; char transA, transB; IFLOAT *buffer; @@ -417,27 +425,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS PRINT_DEBUG_CNAME; -#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) -#if defined(ARCH_x86) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH)) -#if defined(DYNAMIC_ARCH) - if (support_avx512() ) -#endif - if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) { - SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); - return; - } -#endif -#if defined(ARCH_ARM64) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH)) -#if defined(DYNAMIC_ARCH) - if (support_sme1()) -#endif - if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans) { - SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); - return; - } -#endif -#endif - #ifndef COMPLEX args.alpha = (void *)α args.beta = (void *)β @@ -554,6 +541,40 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS return; } + + if ((args.m == 0) || (args.n == 0)) return; +#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16) +#if defined(ARCH_x86) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH)) +#if defined(DYNAMIC_ARCH) + if (support_avx512() ) +#endif + if (order == CblasRowMajor && beta == 0 && alpha == 1.0 && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) { + SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); + return; + } +#endif +#if defined(ARCH_ARM64) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH)) +#if defined(DYNAMIC_ARCH) +if (strcmp(gotoblas_corename(), "armv9sme") == 0 +#if defined(__clang__) + || strcmp(gotoblas_corename(), "vortexm4") == 0 +#endif +) +// if (support_sme1()) +#endif + if (order == CblasRowMajor && m==lda && n ==ldb && k==ldc && beta == 0 && alpha == 1.0 && TransA == CblasNoTrans && TransB == CblasNoTrans&& SGEMM_DIRECT_PERFORMANT(m,n,k)) { + SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); + return; + } +else + if (order == CblasRowMajor && m==lda && n==ldb && k==ldc && TransA == CblasNoTrans && TransB == CblasNoTrans&& SGEMM_DIRECT_PERFORMANT(m,n,k)) { + SGEMM_DIRECT_ALPHA_BETA(m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + return; + } + +#endif +#endif + #endif #if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16) @@ -577,7 +598,18 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS args.m, args.n, args.k, args.lda, args.ldb, args.ldc); #endif -#if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) && (!defined(BFLOAT16) || defined(GEMM_GEMV_FORWARD_BF16)) +#if (!defined(BFLOAT16) || (!defined(BGEMM) && defined(SBGEMM_GEMV_FORWARD)) || (defined(BGEMM) && defined(BGEMM_GEMV_FORWARD))) +#define BFLOAT16_GEMM_GEMV_FORWARD 1 +#else +#define BFLOAT16_GEMM_GEMV_FORWARD 0 +#endif +#if (!defined(HFLOAT16) || (!defined(HGEMM) && defined(SHGEMM_GEMV_FORWARD)) || (defined(HGEMM) && defined(HGEMM_GEMV_FORWARD))) +#define HFLOAT16_GEMM_GEMV_FORWARD 1 +#else +#define HFLOAT16_GEMM_GEMV_FORWARD 0 +#endif + +#if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) && HFLOAT16_GEMM_GEMV_FORWARD && BFLOAT16_GEMM_GEMV_FORWARD #if defined(ARCH_ARM64) // The gemv kernels in arm64/{gemv_n.S,gemv_n_sve.c,gemv_t.S,gemv_t_sve.c} // perform poorly in certain circumstances. We use the following boolean diff --git a/interface/gemm_batch.c b/interface/gemm_batch.c index 56ccc12ce4..0d88544091 100644 --- a/interface/gemm_batch.c +++ b/interface/gemm_batch.c @@ -114,6 +114,17 @@ static size_t zgemm_small_kernel_b0[] = { #endif #endif +#ifndef CBLAS +void NAME(char *transa_array, char *transb_array, + blasint * m_array, blasint * n_array, blasint * k_array, + FLOAT * alpha_array, + IFLOAT ** a_array, blasint * lda_array, + IFLOAT ** b_array, blasint * ldb_array, + FLOAT * beta_array, + FLOAT ** c_array, blasint * ldc_array, blasint * gcount, blasint * group_size) { + blasint group_count = *gcount; +#else + void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE * transa_array, enum CBLAS_TRANSPOSE * transb_array, blasint * m_array, blasint * n_array, blasint * k_array, #ifndef COMPLEX @@ -134,8 +145,11 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE * transa_array, enum CB FLOAT ** a_array=(FLOAT**)va_array; FLOAT ** b_array=(FLOAT**)vb_array; FLOAT ** c_array=(FLOAT**)vc_array; - #endif +#endif + BLASLONG group_m, group_n, group_k; + BLASLONG group_lda, group_ldb, group_ldc; + blas_arg_t * args_array=NULL; int mode=0, group_mode=0; @@ -148,8 +162,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE * transa_array, enum CB blasint info; void * group_alpha, * group_beta; - BLASLONG group_m, group_n, group_k; - BLASLONG group_lda, group_ldb, group_ldc; void * group_routine=NULL; #ifdef SMALL_MATRIX_OPT void * group_small_matrix_opt_routine=NULL; @@ -201,7 +213,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE * transa_array, enum CB group_transa = -1; group_transb = -1; info = 0; - + +#if defined(CBLAS) if (order == CblasColMajor) { group_m = m_array[i]; group_n = n_array[i]; @@ -254,7 +267,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE * transa_array, enum CB group_lda = ldb_array[i]; group_ldb = lda_array[i]; group_ldc = ldc_array[i]; - + if (transb_array[i] == CblasNoTrans) group_transa = 0; if (transb_array[i] == CblasTrans) group_transa = 1; #ifndef COMPLEX @@ -273,6 +286,32 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE * transa_array, enum CB if (transa_array[i] == CblasConjNoTrans) group_transb = 2; if (transa_array[i] == CblasConjTrans) group_transb = 3; #endif + +#else + group_m = m_array[i]; + group_n = n_array[i]; + group_k = k_array[i]; + + group_lda = lda_array[i]; + group_ldb = ldb_array[i]; + group_ldc = ldc_array[i]; + + if (transb_array[i] == 'N') group_transa = 0; + if (transb_array[i] == 'T') group_transa = 1; +#ifndef COMPLEX + if (transb_array[i] == 'C') group_transa = 1; +#else + if (transb_array[i] == 'C') group_transa = 3; +#endif + if (transa_array[i] == 'N') group_transb = 0; + if (transa_array[i] == 'T') group_transb = 1; +#ifndef COMPLEX + if (transa_array[i] == 'C') group_transb = 1; +#else + if (transa_array[i] == 'C') group_transb = 3; +#endif +#endif + group_nrowa = group_m; if (group_transa & 1) group_nrowa = group_k; group_nrowb = group_k; @@ -288,7 +327,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE * transa_array, enum CB if (group_m < 0) info = 3; if (group_transb < 0) info = 2; if (group_transa < 0) info = 1; +#if defined(CBLAS) } +#endif if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); @@ -344,13 +385,17 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE * transa_array, enum CB args_array[count].alpha=group_alpha; args_array[count].beta=group_beta; +#if defined(CBLAS) if (order == CblasColMajor) { args_array[count].a=(a_array[matrix_idx+j]); args_array[count].b=(b_array[matrix_idx+j]); }else if(order == CblasRowMajor){ +#endif args_array[count].a=(b_array[matrix_idx+j]); args_array[count].b=(a_array[matrix_idx+j]); +#if defined(CBLAS) } +#endif args_array[count].c=(c_array[matrix_idx+j]); diff --git a/interface/gemm_batch_strided.c b/interface/gemm_batch_strided.c new file mode 100644 index 0000000000..8435b65502 --- /dev/null +++ b/interface/gemm_batch_strided.c @@ -0,0 +1,425 @@ +/***************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include +#include +#include "common.h" + +void openblas_warning(int verbose, const char * msg); + +#ifndef COMPLEX +#ifdef XDOUBLE +#define ERROR_NAME "QGEMM_BATCH_STRIDED " +#elif defined(DOUBLE) +#define ERROR_NAME "DGEMM_BATCH_STRIDED " +#define GEMM_BATCH_THREAD dgemm_batch_thread +#else +#define ERROR_NAME "SGEMM_BATCH_STRIDED " +#define GEMM_BATCH_THREAD sgemm_batch_thread +#endif +#else +#ifdef XDOUBLE +#define ERROR_NAME "XGEMM_BATCH_STRIDED " +#elif defined(DOUBLE) +#define ERROR_NAME "ZGEMM_BATCH_STRIDED " +#define GEMM_BATCH_THREAD zgemm_batch_thread +#else +#define ERROR_NAME "CGEMM_BATCH_STRIDED " +#define GEMM_BATCH_THREAD cgemm_batch_thread +#endif +#endif +static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, BLASLONG) = { + GEMM_NN, GEMM_TN, GEMM_RN, GEMM_CN, + GEMM_NT, GEMM_TT, GEMM_RT, GEMM_CT, + GEMM_NR, GEMM_TR, GEMM_RR, GEMM_CR, + GEMM_NC, GEMM_TC, GEMM_RC, GEMM_CC, +}; + +#if defined(SMALL_MATRIX_OPT) && !defined(GEMM3M) && !defined(XDOUBLE) +#define USE_SMALL_MATRIX_OPT 1 +#else +#define USE_SMALL_MATRIX_OPT 0 +#endif + +#if USE_SMALL_MATRIX_OPT +#ifndef DYNAMIC_ARCH +#define SMALL_KERNEL_ADDR(table, idx) ((void *)(table[idx])) +#else +#define SMALL_KERNEL_ADDR(table, idx) ((void *)(*(uintptr_t *)((char *)gotoblas + (size_t)(table[idx])))) +#endif + + +#ifndef COMPLEX +static size_t gemm_small_kernel[] = { + GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, 0, 0, + GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, 0, 0, +}; + + +static size_t gemm_small_kernel_b0[] = { + GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, 0, 0, + GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, 0, 0, +}; + +#define GEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, IFLOAT *, BLASLONG, FLOAT, IFLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel_b0, (idx)) +#define GEMM_SMALL_KERNEL(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, IFLOAT *, BLASLONG, FLOAT, IFLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel, (idx)) +#else + +static size_t zgemm_small_kernel[] = { + GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, GEMM_SMALL_KERNEL_RN, GEMM_SMALL_KERNEL_CN, + GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, GEMM_SMALL_KERNEL_RT, GEMM_SMALL_KERNEL_CT, + GEMM_SMALL_KERNEL_NR, GEMM_SMALL_KERNEL_TR, GEMM_SMALL_KERNEL_RR, GEMM_SMALL_KERNEL_CR, + GEMM_SMALL_KERNEL_NC, GEMM_SMALL_KERNEL_TC, GEMM_SMALL_KERNEL_RC, GEMM_SMALL_KERNEL_CC, +}; + +static size_t zgemm_small_kernel_b0[] = { + GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, GEMM_SMALL_KERNEL_B0_RN, GEMM_SMALL_KERNEL_B0_CN, + GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, GEMM_SMALL_KERNEL_B0_RT, GEMM_SMALL_KERNEL_B0_CT, + GEMM_SMALL_KERNEL_B0_NR, GEMM_SMALL_KERNEL_B0_TR, GEMM_SMALL_KERNEL_B0_RR, GEMM_SMALL_KERNEL_B0_CR, + GEMM_SMALL_KERNEL_B0_NC, GEMM_SMALL_KERNEL_B0_TC, GEMM_SMALL_KERNEL_B0_RC, GEMM_SMALL_KERNEL_B0_CC, +}; + +#define ZGEMM_SMALL_KERNEL(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(zgemm_small_kernel, (idx)) +#define ZGEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(zgemm_small_kernel_b0, (idx)) +#endif +#endif + +#ifndef CBLAS +void NAME(char *transa, char *transb, + blasint * M, blasint * N, blasint * K, + FLOAT * Alpha, + IFLOAT * a, blasint * Lda, + blasint * stride_a, + IFLOAT *b, blasint * Ldb, + blasint * stride_b, + FLOAT * Beta, + FLOAT * c, blasint * Ldc, blasint * stride_c, blasint * matcount) { + + char ta = *transa; + char tb = *transb; + blasint count = *matcount; + blasint stridea= *stride_a; + blasint strideb= *stride_b; + blasint stridec= *stride_c; + blasint m=*M; + blasint n=*N; + blasint k=*K; + blasint lda=*Lda; + blasint ldb=*Ldb; + blasint ldc=*Ldc; +#if !defined(COMPLEX) + FLOAT alpha=*Alpha; + FLOAT beta=*Beta; +#else + FLOAT *alpha=Alpha; + FLOAT *beta=Beta; +#endif +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE transa, enum CBLAS_TRANSPOSE transb, + blasint m, blasint n, blasint k, +#ifndef COMPLEX + FLOAT alpha, + IFLOAT * a, blasint lda, blasint stridea, + IFLOAT * b, blasint ldb, blasint strideb, + FLOAT beta, + FLOAT * c, blasint ldc, blasint stridec, blasint count) { +#else + void * valpha, + void * va, blasint lda, blasint stridea, + void * vb, blasint ldb, blasint strideb, + void * vbeta, + void * vc, blasint ldc, blasint stridec, blasint count) { + + FLOAT * alpha=(FLOAT *)valpha; + FLOAT * beta=(FLOAT *)vbeta; + FLOAT * a=(FLOAT*)va; + FLOAT * b=(FLOAT*)vb; + FLOAT * c=(FLOAT*)vc; +#endif +#endif + BLASLONG group_m, group_n, group_k; + BLASLONG group_lda, group_ldb, group_ldc; + + blas_arg_t * args_array=NULL; + + int mode=0, group_mode=0; + + blasint i=0; + + int group_transa, group_transb; + BLASLONG group_nrowa, group_nrowb; + blasint info; + + void * group_routine=NULL; +#ifdef SMALL_MATRIX_OPT + void * group_small_matrix_opt_routine=NULL; +#endif + +#if defined (SMP) || defined(SMALL_MATRIX_OPT) + double MNK; +#endif + + PRINT_DEBUG_CNAME; + + args_array=(blas_arg_t *)malloc(count * sizeof(blas_arg_t)); + + if(args_array == NULL){ + openblas_warning(0, "memory alloc failed!\n"); + return; + } + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + for(i=0; i= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + free(args_array); + return; + } + + if (group_m == 0 || group_n == 0) continue; + + group_mode=mode; + +#if defined(SMP) || defined(SMALL_MATRIX_OPT) + MNK = (double) group_m * (double) group_n * (double) group_k; +#endif + +#ifdef SMALL_MATRIX_OPT + if (MNK <= 100.0*100.0*100.0){ + group_routine=NULL; +#if !defined(COMPLEX) + if(beta == 0.0){ + group_mode=mode | BLAS_SMALL_B0_OPT; + group_small_matrix_opt_routine=(void *)(gemm_small_kernel_b0[(group_transb<<2)|group_transa]); + }else{ + group_mode=mode | BLAS_SMALL_OPT; + group_small_matrix_opt_routine=(void *)(gemm_small_kernel[(group_transb<<2)|group_transa]); + } +#else + if(beta[0] == 0.0 && beta[1] == 0.0){ + group_mode=mode | BLAS_SMALL_B0_OPT; + group_small_matrix_opt_routine=(void *)(zgemm_small_kernel_b0[(group_transb<<2)|group_transa]); + }else{ + group_mode=mode | BLAS_SMALL_OPT; + group_small_matrix_opt_routine=(void *)(zgemm_small_kernel[(group_transb<<2)|group_transa]); + } + +#endif + + }else{ +#endif + group_routine=(void*)(gemm[(group_transb<<2)|group_transa]); +#ifdef SMALL_MATRIX_OPT + } +#endif + + + args_array[i].m=group_m; + args_array[i].n=group_n; + args_array[i].k=group_k; + args_array[i].lda=group_lda; + args_array[i].ldb=group_ldb; + args_array[i].ldc=group_ldc; + args_array[i].alpha=α + args_array[i].beta=β + +#if defined(CBLAS) + if (order == CblasColMajor) { + args_array[i].a=&(a[i*stridea]); + args_array[i].b=&(b[i*strideb]); + }else if(order == CblasRowMajor){ + args_array[i].a=&(b[i*strideb]); + args_array[i].b=&(a[i*stridea]); + } +#else + args_array[i].a=&(a[i*stridea]); + args_array[i].b=&(b[i*strideb]); +#endif + + args_array[i].c= &c[i*stridec]; + + args_array[i].routine_mode=group_mode; + args_array[i].routine=group_routine; +#ifdef SMALL_MATRIX_OPT + if (!group_routine) + args_array[i].routine=group_small_matrix_opt_routine; +#endif + } + + if(count>0) { + GEMM_BATCH_THREAD(args_array,count); + } + + free(args_array); +} diff --git a/interface/gemv.c b/interface/gemv.c index 34b6addd3d..79589d8085 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -1,4 +1,5 @@ /*********************************************************************/ +/* Copyright 2025 The OpenBLAS Project */ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ @@ -81,9 +82,12 @@ static inline int get_gemv_optimal_nthreads_neoversev1(BLASLONG MN, int ncpu) { : (MN < 1050625L) ? MIN(ncpu, 40) : ncpu; #else - return (MN < 25600L) ? 1 + return + (MN < 25600L) ? 1 : (MN < 63001L) ? MIN(ncpu, 4) - : (MN < 459684L) ? MIN(ncpu, 16) + : (MN < 202500L) ? MIN(ncpu, 8) + : (MN < 806404L) ? MIN(ncpu, 16) + : (MN < 1638400L) ? MIN(ncpu, 32) : ncpu; #endif } @@ -93,9 +97,9 @@ static inline int get_gemv_optimal_nthreads_neoversev1(BLASLONG MN, int ncpu) { static inline int get_gemv_optimal_nthreads_neoversev2(BLASLONG MN, int ncpu) { return MN < 24964L ? 1 - : MN < 65536L ? MIN(ncpu, 8) - : MN < 262144L ? MIN(ncpu, 32) - : MN < 1638400L ? MIN(ncpu, 64) + : MN < 145924L ? MIN(ncpu, 8) + : MN < 692224L ? MIN(ncpu, 16) + : MN < 1638400L ? MIN(ncpu, 32) : ncpu; } #endif diff --git a/interface/ger.c b/interface/ger.c index af6ae8606a..640b8cb443 100644 --- a/interface/ger.c +++ b/interface/ger.c @@ -180,8 +180,10 @@ void CNAME(enum CBLAS_ORDER order, #ifdef SMPTEST // Threshold chosen so that speed-up is > 1 on a Xeon E5-2630 - if(1L * m * n > 2048L * GEMM_MULTITHREAD_THRESHOLD) + if(1L * m * n > 20480L * GEMM_MULTITHREAD_THRESHOLD) nthreads = num_cpu_avail(2); + else if(1L * m * n > 2048L * GEMM_MULTITHREAD_THRESHOLD) + nthreads = MIN(3,num_cpu_avail(2)); else nthreads = 1; diff --git a/interface/lapack/laed3.c b/interface/lapack/laed3.c new file mode 100644 index 0000000000..4e5215fcf7 --- /dev/null +++ b/interface/lapack/laed3.c @@ -0,0 +1,87 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if defined(DOUBLE) +#define ERROR_NAME "DLAED3" +#else +#define ERROR_NAME "SLAED3" +#endif + +/* ===================================================================== */ +int NAME(blasint *k, blasint *n, blasint *n1, FLOAT *d, + FLOAT *q, blasint *ldq, FLOAT *rho, FLOAT *dlamda, + FLOAT *q2, blasint *indx, blasint *ctot, FLOAT *w, + FLOAT *s, blasint *Info) +{ + blasint kval, nval, qdim, info; + + qdim = *ldq; + kval = *k; + nval = *n; + +/* Test the input parameters. */ + info = 0; + if (kval < 0) { + info = 1; + } else if (nval < kval) { + info = 2; + } else if (qdim < nval || qdim < 1) { + info = 6; + } + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1); + *Info = - info; + return 0; + } + +/* Quick return if possible */ + + *Info = 0; + if (kval == 0) return 0; + +#ifdef SMP + int nthreads = num_cpu_avail(4); + + if (nthreads == 1) { +#endif + LAED3_SINGLE(k, n, n1, d, q, ldq, rho, dlamda, q2, indx, ctot, w, s, Info); +#ifdef SMP + } else { + LAED3_PARALLEL(k, n, n1, d, q, ldq, rho, dlamda, q2, indx, ctot, w, s, Info); + } +#endif + + return 0; +} diff --git a/interface/lapack/trtri.c b/interface/lapack/trtri.c index df79f26656..82d806b8c2 100644 --- a/interface/lapack/trtri.c +++ b/interface/lapack/trtri.c @@ -127,10 +127,10 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In #endif #ifdef SMP -if (args.n <= 150) - args.nthreads = 1; -else - args.nthreads = num_cpu_avail(4); + if (args.n <= 150) + args.nthreads = 1; + else + args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif diff --git a/interface/sbgemmt.c b/interface/sbgemmt.c index 759af4bfb2..67914fe65f 100644 --- a/interface/sbgemmt.c +++ b/interface/sbgemmt.c @@ -1,5 +1,5 @@ /*********************************************************************/ -/* Copyright 2024, The OpenBLAS Project. */ +/* Copyright 2024-2025 The OpenBLAS Project. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -305,7 +305,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif int (*gemv[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *, BLASLONG, IFLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = { - SBGEMV_N, SBGEMV_T,}; + GEMV_N, GEMV_T,}; if (m == 0) diff --git a/interface/sbgemv.c b/interface/sbgemv.c index fce86f8e46..12db2dfb1c 100644 --- a/interface/sbgemv.c +++ b/interface/sbgemv.c @@ -1,4 +1,5 @@ /*********************************************************************/ +/* Copyright 2025 The OpenBLAS Project. */ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ @@ -43,17 +44,29 @@ #include "functable.h" #endif +#ifdef BGEMM +#define GEMV_THREAD_N bgemv_thread_n +#define GEMV_THREAD_T bgemv_thread_t +#define ERROR_NAME "BGEMV " +#elif defined(HFLOAT16) +#define GEMV_THREAD_N shgemv_thread_n +#define GEMV_THREAD_T shgemv_thread_t +#define ERROR_NAME "SHGEMV " +#else +#define GEMV_THREAD_N sbgemv_thread_n +#define GEMV_THREAD_T sbgemv_thread_t #define ERROR_NAME "SBGEMV " +#endif #ifdef SMP -static int (*sbgemv_thread[])(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 * , BLASLONG, float, float *, BLASLONG, int) = { - sbgemv_thread_n, sbgemv_thread_t, +static int (*gemv_thread[])(BLASLONG, BLASLONG, FLOAT, IFLOAT *, BLASLONG, IFLOAT * , BLASLONG, FLOAT, FLOAT *, BLASLONG, int) = { + GEMV_THREAD_N, GEMV_THREAD_T, }; #endif #ifndef CBLAS -void NAME(char *TRANS, blasint *M, blasint *N, float *ALPHA, bfloat16 *a, blasint *LDA, bfloat16 *x, blasint *INCX, float *BETA, float *y, blasint *INCY) +void NAME(char *TRANS, blasint *M, blasint *N, FLOAT *ALPHA, IFLOAT *a, blasint *LDA, IFLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY) { char trans = *TRANS; blasint m = *M; @@ -61,14 +74,14 @@ void NAME(char *TRANS, blasint *M, blasint *N, float *ALPHA, bfloat16 *a, blasin blasint lda = *LDA; blasint incx = *INCX; blasint incy = *INCY; - float alpha = *ALPHA; - float beta = *BETA; + FLOAT alpha = *ALPHA; + FLOAT beta = *BETA; #ifdef SMP int nthreads; #endif - int (*sbgemv[])(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 * , BLASLONG, float, float *, BLASLONG) = { - SBGEMV_N, SBGEMV_T, + int (*gemv[])(BLASLONG, BLASLONG, FLOAT, IFLOAT *, BLASLONG, IFLOAT * , BLASLONG, FLOAT, FLOAT *, BLASLONG) = { + GEMV_N, GEMV_T, }; blasint info; @@ -104,7 +117,7 @@ void NAME(char *TRANS, blasint *M, blasint *N, float *ALPHA, bfloat16 *a, blasin #else -void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint m, blasint n, float alpha, bfloat16 *a, blasint lda, bfloat16 *x, blasint incx, float beta, float *y, blasint incy) +void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint m, blasint n, FLOAT alpha, IFLOAT *a, blasint lda, IFLOAT *x, blasint incx, FLOAT beta, FLOAT *y, blasint incy) { blasint lenx, leny; int trans; @@ -113,8 +126,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint m, blasi int nthreads; #endif - int (*sbgemv[])(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 * , BLASLONG, float, float *, BLASLONG) = { - SBGEMV_N, SBGEMV_T, + int (*gemv[])(BLASLONG, BLASLONG, FLOAT, IFLOAT *, BLASLONG, IFLOAT * , BLASLONG, FLOAT, FLOAT *, BLASLONG) = { + GEMV_N, GEMV_T, }; PRINT_DEBUG_CNAME; @@ -166,8 +179,17 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint m, blasi leny = m; } - if (alpha == ZERO) { - if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); +#ifdef BGEMM + float alpha_float, beta_float; + SBF16TOS_K(1, &alpha, 1, &alpha_float, 1); + SBF16TOS_K(1, &beta, 1, &beta_float, 1); +#else + float alpha_float = alpha; + float beta_float = beta; +#endif + + if (alpha_float == ZERO) { + if (beta_float != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); return; } @@ -185,10 +207,10 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint m, blasi if (nthreads == 1) { #endif - (sbgemv[(int)trans])(m, n, alpha, a, lda, x, incx, beta, y, incy); + (gemv[(int)trans])(m, n, alpha, a, lda, x, incx, beta, y, incy); #ifdef SMP } else { - (sbgemv_thread[(int)trans])(m, n, alpha, a, lda, x, incx, beta, y, incy, nthreads); + (gemv_thread[(int)trans])(m, n, alpha, a, lda, x, incx, beta, y, incy, nthreads); } #endif diff --git a/interface/scal.c b/interface/scal.c index c6638a62d5..e03038ecbf 100644 --- a/interface/scal.c +++ b/interface/scal.c @@ -1,4 +1,5 @@ /*********************************************************************/ +/* Copyright 2025 The OpenBLAS Project. */ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ @@ -68,7 +69,14 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){ if (incx <= 0 || n <= 0) return; - if (alpha == ONE) return; +#ifdef BGEMM + float alpha_float; + SBF16TOS_K(1, &alpha, 1, &alpha_float, 1); +#else + FLOAT alpha_float = alpha; +#endif + + if (alpha_float == ONE) return; IDEBUG_START; diff --git a/interface/symm.c b/interface/symm.c index 3e6e0fd488..ea00217050 100644 --- a/interface/symm.c +++ b/interface/symm.c @@ -97,6 +97,9 @@ #define GEMM_MULTITHREAD_THRESHOLD 4 #endif +#ifdef DYNAMIC_ARCH +extern char* gotoblas_corename(void); +#endif #ifdef SMP #ifndef COMPLEX @@ -371,6 +374,28 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, return; } +#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16) +#if defined(ARCH_ARM64) && (defined(USE_SSYMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH)) +#if defined(DYNAMIC_ARCH) +if (strcmp(gotoblas_corename(), "armv9sme") == 0 +#if defined(__clang__) + || strcmp(gotoblas_corename(), "vortexm4") == 0 +#endif +) +#endif + if (args.m == 0 || args.n == 0) return; + if (order == CblasRowMajor && m == lda && n == ldb && n == ldc) + { + if (Side == CblasLeft && Uplo == CblasUpper) { + SSYMM_DIRECT_ALPHA_BETA_LU(m, n, alpha, a, lda, b, ldb, beta, c, ldc); return; + } + else if (Side == CblasLeft && Uplo == CblasLower) { + SSYMM_DIRECT_ALPHA_BETA_LL(m, n, alpha, a, lda, b, ldb, beta, c, ldc); return; + } + } +#endif +#endif + #endif if (args.m == 0 || args.n == 0) return; diff --git a/interface/syr2k.c b/interface/syr2k.c index 47df7f89f0..bfdc1c0297 100644 --- a/interface/syr2k.c +++ b/interface/syr2k.c @@ -345,9 +345,46 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr return; } + if (args.n == 0) return; + +#ifdef DYNAMIC_ARCH +extern char* gotoblas_corename(void); +#endif + +#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16) +#if defined(ARCH_ARM64) && (defined(USE_SSYR2K_KERNEL_DIRECT)||defined(DYNAMIC_ARCH)) +#if defined(DYNAMIC_ARCH) +if (strcmp(gotoblas_corename(), "armv9sme") == 0 +#if defined(__clang__) + || strcmp(gotoblas_corename(), "vortexm4") == 0 +#endif +) +#endif + if (order == CblasRowMajor && n == ldc) { + if (Trans == CblasNoTrans && k == lda && k == ldb) { + if (Uplo == CblasUpper) { + SSYR2K_DIRECT_ALPHA_BETA_UN(n, k, alpha, a, lda, b, ldb, beta, c, ldc); + return; + }else if (Uplo == CblasLower) { + SSYR2K_DIRECT_ALPHA_BETA_LN(n, k, alpha, a, lda, b, ldb, beta, c, ldc); + return; + } + } + else if (Trans == CblasTrans && n == lda && n ==ldb) { + if (Uplo == CblasUpper) { + SSYR2K_DIRECT_ALPHA_BETA_UT(n, k, alpha, a, lda, b, ldb, beta, c, ldc); + return; + }else if (Uplo == CblasLower) { + SSYR2K_DIRECT_ALPHA_BETA_LT(n, k, alpha, a, lda, b, ldb, beta, c, ldc); + return; + } + } + } +#endif +#endif + #endif - if (args.n == 0) return; IDEBUG_START; diff --git a/interface/syrk.c b/interface/syrk.c index 69f2328a44..3647ac5cfd 100644 --- a/interface/syrk.c +++ b/interface/syrk.c @@ -339,9 +339,35 @@ double NNK; return; } + if (args.n == 0) return; + +#ifdef DYNAMIC_ARCH +extern char* gotoblas_corename(void); +#endif + +#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16) +#if defined(ARCH_ARM64) && (defined(USE_SSYRK_KERNEL_DIRECT)||defined(DYNAMIC_ARCH)) +#if defined(DYNAMIC_ARCH) +if (strcmp(gotoblas_corename(), "armv9sme") == 0 +#if defined(__clang__) + || strcmp(gotoblas_corename(), "vortexm4") == 0 +#endif +) +#endif + if (order == CblasRowMajor && n == ldc) { + if (Trans == CblasNoTrans && k == lda) { + (Uplo == CblasUpper ? SSYRK_DIRECT_ALPHA_BETA_UN : SSYRK_DIRECT_ALPHA_BETA_LN)(n, k, alpha, a, lda, beta, c, ldc); + return; + } else if (Trans == CblasTrans && n == lda){ + (Uplo == CblasUpper ? SSYRK_DIRECT_ALPHA_BETA_UT : SSYRK_DIRECT_ALPHA_BETA_LT)(n, k, alpha, a, lda, beta, c, ldc); + return; + } + } +#endif +#endif + #endif - if (args.n == 0) return; IDEBUG_START; diff --git a/interface/trmv.c b/interface/trmv.c index 2e52527a3c..029a83499d 100644 --- a/interface/trmv.c +++ b/interface/trmv.c @@ -219,7 +219,10 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #ifdef SMP nthreads = num_cpu_avail(2); - + + if (n < 50 ) nthreads = 1; + if (nthreads > 2 && n < 500) nthreads = 2; + if (nthreads == 1) { #endif diff --git a/interface/trsm.c b/interface/trsm.c index 715c83a1f3..6584bb4d75 100644 --- a/interface/trsm.c +++ b/interface/trsm.c @@ -87,6 +87,10 @@ #define SMP_FACTOR 128 #endif +#ifdef DYNAMIC_ARCH +extern char* gotoblas_corename(void); +#endif + static int (*trsm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { #ifndef TRMM TRSM_LNUU, TRSM_LNUN, TRSM_LNLU, TRSM_LNLN, @@ -355,6 +359,27 @@ void CNAME(enum CBLAS_ORDER order, return; } +#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16) +#if defined(ARCH_ARM64) && (defined(USE_STRMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH)) +#if defined(DYNAMIC_ARCH) +if (strcmp(gotoblas_corename(), "armv9sme") == 0 +#if defined(__clang__) + || strcmp(gotoblas_corename(), "vortexm4") == 0 +#endif +) +#endif + if (args.m == 0 || args.n == 0) return; + if (order == CblasRowMajor && Diag == CblasNonUnit && Side == CblasLeft && m == lda && n == ldb) { + if (Trans == CblasNoTrans) { + (Uplo == CblasUpper ? STRMM_DIRECT_LNUN : STRMM_DIRECT_LNLN)(m, n, alpha, a, lda, b, ldb); + } else if (Trans == CblasTrans) { + (Uplo == CblasUpper ? STRMM_DIRECT_LTUN : STRMM_DIRECT_LTLN)(m, n, alpha, a, lda, b, ldb); + } + return; + } +#endif +#endif + #endif if ((args.m == 0) || (args.n == 0)) return; diff --git a/interface/zher.c b/interface/zher.c index 0d24984e60..49b1cec03b 100644 --- a/interface/zher.c +++ b/interface/zher.c @@ -177,7 +177,10 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP - nthreads = num_cpu_avail(2); + if (n < 100) + nthreads = 1; + else + nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif diff --git a/interface/zher2.c b/interface/zher2.c index 1cae633ce0..eba4c4f41b 100644 --- a/interface/zher2.c +++ b/interface/zher2.c @@ -186,7 +186,10 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, void *VALPHA buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP - nthreads = num_cpu_avail(2); + if (n < 100) + nthreads = 1; + else + nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif diff --git a/interface/zhpr.c b/interface/zhpr.c index 10507a71fa..34700549e3 100644 --- a/interface/zhpr.c +++ b/interface/zhpr.c @@ -175,7 +175,10 @@ void CNAME(enum CBLAS_ORDER order, buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP - nthreads = num_cpu_avail(2); + if (n < 100) + nthreads = 1; + else + nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif diff --git a/interface/zhpr2.c b/interface/zhpr2.c index c9bfb44b05..1cbaaa062b 100644 --- a/interface/zhpr2.c +++ b/interface/zhpr2.c @@ -187,7 +187,10 @@ void CNAME(enum CBLAS_ORDER order, buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP - nthreads = num_cpu_avail(2); + if (n < 100) + nthreads = 1; + else + nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 48c8955888..8932d3fe96 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -1,3 +1,30 @@ +############################################################################### +# Copyright (c) 2025, The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +############################################################################### include_directories(${PROJECT_SOURCE_DIR}) @@ -94,6 +121,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) # sbdot if (BUILD_BFLOAT16) + GenerateNamedObjects("${KERNELDIR}/${BSCALKERNEL}" "BGEMM" "scal_k" false "" "" false "BFLOAT16") GenerateNamedObjects("${KERNELDIR}/${SBDOTKERNEL}" "SBDOT" "dot_k" false "" "" false "BFLOAT16") GenerateNamedObjects("${KERNELDIR}/${BF16TOKERNEL}" "SINGLE" "f16tos_k" false "" "" false "BFLOAT16") GenerateNamedObjects("${KERNELDIR}/${BF16TOKERNEL}" "DOUBLE" "bf16tod_k" false "" "" false "DOUBLE") @@ -195,9 +223,15 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE") endif () if (BUILD_BFLOAT16) + GenerateNamedObjects("${KERNELDIR}/${BGEMVNKERNEL}" "BGEMM" "gemv_n" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${BGEMVTKERNEL}" "BGEMM" "gemv_t" false "" "" false "BFLOAT16") GenerateNamedObjects("${KERNELDIR}/${SBGEMVNKERNEL}" "" "gemv_n" false "" "" false "BFLOAT16") GenerateNamedObjects("${KERNELDIR}/${SBGEMVTKERNEL}" "" "gemv_t" false "" "" false "BFLOAT16") endif () + if (BUILD_HFLOAT16) + GenerateNamedObjects("${KERNELDIR}/${SHGEMVNKERNEL}" "" "gemv_n" false "" "" false "HFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SHGEMVTKERNEL}" "" "gemv_t" false "" "" false "HFLOAT16") + endif () # Makefile.L3 set(USE_TRMM false) string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE) @@ -207,10 +241,29 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10)) set(USE_TRMM true) endif () + set(USE_DIRECT_STRMM false) + if (ARM64) + set(USE_DIRECT_STRMM true) + endif() + set(USE_DIRECT_SSYRK false) + if (ARM64) + set(USE_DIRECT_SSYRK true) + endif() + set(USE_DIRECT_SSYR2K false) + if (ARM64) + set(USE_DIRECT_SSYR2K true) + endif() set(USE_DIRECT_SGEMM false) if (X86_64 OR ARM64) set(USE_DIRECT_SGEMM true) endif() + set(USE_DIRECT_SSYMM false) + if (ARM64) + set(USE_DIRECT_SSYMM true) + endif() + if (UC_TARGET_CORE MATCHES ARMV9SME OR UC_TARGET_CORE MATCHES VORTEXM4) + set (HAVE_SME true) + endif () if (USE_DIRECT_SGEMM) # if (NOT DEFINED SGEMMDIRECTKERNEL) @@ -221,17 +274,59 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE) GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE) elseif (ARM64) + set (SGEMMDIRECTPERFORMANT sgemm_direct_performant.c) set (SGEMMDIRECTKERNEL sgemm_direct_arm64_sme1.c) - set (SGEMMDIRECTSMEKERNEL sgemm_direct_sme1.S) + set (SGEMMDIRECTKERNEL_ALPHA_BETA sgemm_direct_alpha_beta_arm64_sme1.c) + set (SGEMMDIRECTSMEKERNEL sgemm_direct_sme1_2VLx2VL.S) set (SGEMMDIRECTPREKERNEL sgemm_direct_sme1_preprocess.S) + GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE) GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE) + GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL_ALPHA_BETA}" "" "gemm_direct_alpha_beta" false "" "" false SINGLE) if (HAVE_SME) - GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTSMEKERNEL}" "" "gemm_direct_sme1" false "" "" false SINGLE) + GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTSMEKERNEL}" "" "gemm_direct_sme1_2VLx2VL" false "" "" false SINGLE) GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPREKERNEL}" "" "gemm_direct_sme1_preprocess" false "" "" false SINGLE) endif () endif () endif() + if (USE_DIRECT_SSYMM) + if (ARM64) + set (SSYMMDIRECTKERNEL_ALPHA_BETA ssymm_direct_alpha_beta_arm64_sme1.c) + GenerateNamedObjects("${KERNELDIR}/${SSYMMDIRECTKERNEL_ALPHA_BETA}" "" "symm_direct_alpha_betaLU" false "" "" false SINGLE) + GenerateNamedObjects("${KERNELDIR}/${SSYMMDIRECTKERNEL_ALPHA_BETA}" "" "symm_direct_alpha_betaLL" false "" "" false SINGLE) + endif () + endif() + + if (USE_DIRECT_STRMM) + if (ARM64) + set (STRMMDIRECTKERNEL strmm_direct_arm64_sme1.c) + GenerateNamedObjects("${KERNELDIR}/${STRMMDIRECTKERNEL}" "" "trmm_direct_LNUN" false "" "" false SINGLE) + GenerateNamedObjects("${KERNELDIR}/${STRMMDIRECTKERNEL}" "" "trmm_direct_LNLN" false "" "" false SINGLE) + GenerateNamedObjects("${KERNELDIR}/${STRMMDIRECTKERNEL}" "" "trmm_direct_LTUN" false "" "" false SINGLE) + GenerateNamedObjects("${KERNELDIR}/${STRMMDIRECTKERNEL}" "" "trmm_direct_LTLN" false "" "" false SINGLE) + endif () + endif () + + if (USE_DIRECT_SSYRK) + if (ARM64) + set (SSYRKDIRECTKERNEL_ALPHA_BETA ssyrk_direct_alpha_beta_arm64_sme1.c) + GenerateNamedObjects("${KERNELDIR}/${SSYRKDIRECTKERNEL_ALPHA_BETA}" "" "syrk_direct_alpha_betaUN" false "" "" false SINGLE) + GenerateNamedObjects("${KERNELDIR}/${SSYRKDIRECTKERNEL_ALPHA_BETA}" "" "syrk_direct_alpha_betaUT" false "" "" false SINGLE) + GenerateNamedObjects("${KERNELDIR}/${SSYRKDIRECTKERNEL_ALPHA_BETA}" "" "syrk_direct_alpha_betaLN" false "" "" false SINGLE) + GenerateNamedObjects("${KERNELDIR}/${SSYRKDIRECTKERNEL_ALPHA_BETA}" "" "syrk_direct_alpha_betaLT" false "" "" false SINGLE) + endif () + endif() + + if (USE_DIRECT_SSYR2K) + if (ARM64) + set (SSYR2KDIRECTKERNEL_ALPHA_BETA ssyr2k_direct_alpha_beta_arm64_sme1.c) + GenerateNamedObjects("${KERNELDIR}/${SSYR2KDIRECTKERNEL_ALPHA_BETA}" "" "syr2k_direct_alpha_betaUN" false "" "" false SINGLE) + GenerateNamedObjects("${KERNELDIR}/${SSYR2KDIRECTKERNEL_ALPHA_BETA}" "" "syr2k_direct_alpha_betaUT" false "" "" false SINGLE) + GenerateNamedObjects("${KERNELDIR}/${SSYR2KDIRECTKERNEL_ALPHA_BETA}" "" "syr2k_direct_alpha_betaLN" false "" "" false SINGLE) + GenerateNamedObjects("${KERNELDIR}/${SSYR2KDIRECTKERNEL_ALPHA_BETA}" "" "syr2k_direct_alpha_betaLT" false "" "" false SINGLE) + endif () + endif() + foreach (float_type SINGLE DOUBLE) string(SUBSTRING ${float_type} 0 1 float_char) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) @@ -336,6 +431,20 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) endif () if (BUILD_BFLOAT16) + if (BGEMMINCOPY) + GenerateNamedObjects("${KERNELDIR}/${BGEMMINCOPY}" "BGEMM" "${BGEMMINCOPYOBJ}" false "" "" true "BFLOAT16") + endif () + if (BGEMMITCOPY) + GenerateNamedObjects("${KERNELDIR}/${BGEMMITCOPY}" "BGEMM" "${BGEMMITCOPYOBJ}" false "" "" true "BFLOAT16") + endif () + if (BGEMMONCOPY) + GenerateNamedObjects("${KERNELDIR}/${BGEMMONCOPY}" "BGEMM" "${BGEMMONCOPYOBJ}" false "" "" true "BFLOAT16") + endif () + if (BGEMMOTCOPY) + GenerateNamedObjects("${KERNELDIR}/${BGEMMOTCOPY}" "BGEMM" "${BGEMMOTCOPYOBJ}" false "" "" true "BFLOAT16") + endif () + GenerateNamedObjects("${KERNELDIR}/${BGEMMKERNEL}" "BGEMM" "gemm_kernel" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${BGEMM_BETA}" "BGEMM" "gemm_beta" false "" "" false "BFLOAT16") if (SBGEMMINCOPY) GenerateNamedObjects("${KERNELDIR}/${SBGEMMINCOPY}" "" "${SBGEMMINCOPYOBJ}" false "" "" true "BFLOAT16") endif () @@ -351,6 +460,22 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${SBGEMMKERNEL}" "" "gemm_kernel" false "" "" false "BFLOAT16") GenerateNamedObjects("${KERNELDIR}/${SBGEMM_BETA}" "" "gemm_beta" false "" "" false "BFLOAT16") endif () + if (BUILD_HFLOAT16) + if (SHGEMMINCOPY) + GenerateNamedObjects("${KERNELDIR}/${SHGEMMINCOPY}" "" "${SHGEMMINCOPYOBJ}" false "" "" true "HFLOAT16") + endif () + if (SHGEMMITCOPY) + GenerateNamedObjects("${KERNELDIR}/${SHGEMMITCOPY}" "" "${SHGEMMITCOPYOBJ}" false "" "" true "HFLOAT16") + endif () + if (SHGEMMONCOPY) + GenerateNamedObjects("${KERNELDIR}/${SHGEMMONCOPY}" "" "${SHGEMMONCOPYOBJ}" false "" "" true "HFLOAT16") + endif () + if (SHGEMMOTCOPY) + GenerateNamedObjects("${KERNELDIR}/${SHGEMMOTCOPY}" "" "${SHGEMMOTCOPYOBJ}" false "" "" true "HFLOAT16") + endif () + GenerateNamedObjects("${KERNELDIR}/${SHGEMMKERNEL}" "" "gemm_kernel" false "" "" false "HFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SHGEMM_BETA}" "" "gemm_beta" false "" "" false "HFLOAT16") + endif () foreach (float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char) if (${float_char}GEMMINCOPY) @@ -377,6 +502,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) set(TRMM_KERNEL "${${float_char}GEMMKERNEL}") endif () + if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") # just enumerate all these. there is an extra define for these indicating which side is a conjugate (e.g. CN NC NN) that I don't really want to work into GenerateCombinationObjects @@ -769,6 +895,45 @@ endif () GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "BFLOAT16") GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16") endif () + + if (BUILD_HFLOAT16) + if (NOT DEFINED SHGEMM_SMALL_M_PERMIT) + set(SHGEMM_SMALL_M_PERMIT ../generic/gemm_small_matrix_permit.c) + endif () + if (NOT DEFINED SHGEMM_SMALL_K_NN) + set(SHGEMM_SMALL_K_NN ../generic/gemm_small_matrix_kernel_nn.c) + endif () + if (NOT DEFINED SHGEMM_SMALL_K_NT) + set(SHGEMM_SMALL_K_NT ../generic/gemm_small_matrix_kernel_nt.c) + endif () + if (NOT DEFINED SHGEMM_SMALL_K_TN) + set(SHGEMM_SMALL_K_TN ../generic/gemm_small_matrix_kernel_tn.c) + endif () + if (NOT DEFINED SHGEMM_SMALL_K_TT) + set(SHGEMM_SMALL_K_TT ../generic/gemm_small_matrix_kernel_tt.c) + endif () + if (NOT DEFINED SHGEMM_SMALL_K_B0_NN) + set(SHGEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_nn.c) + endif () + if (NOT DEFINED SHGEMM_SMALL_K_B0_NT) + set(SHGEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_nt.c) + endif () + if (NOT DEFINED SHGEMM_SMALL_K_B0_TN) + set(SHGEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c) + endif () + if (NOT DEFINED SHGEMM_SMALL_K_B0_TT) + set(SHGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c) + endif () + GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false "HFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "HFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false "HFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false "HFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false "HFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false "HFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false "HFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "HFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "HFLOAT16") + endif () endif () if (NOT DEFINED ${float_char}OMATCOPY_CN) diff --git a/kernel/Makefile b/kernel/Makefile index 84cd482a06..014908c7eb 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -27,7 +27,29 @@ endif ifdef TARGET_CORE ifeq ($(TARGET_CORE), ARMV9SME) - override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -DHAVE_SME -march=armv9-a+sve2+sme + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=armv9-a+sve2+sme + ifdef OS_WINDOWS + ifeq ($(C_COMPILER), CLANG) + override CFLAGS += --aarch64-stack-hazard-size=0 + endif + endif +endif +ifeq ($(TARGET_CORE), VORTEXM4) + ifeq ($(C_COMPILER), GCC) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -UHAVE_SME -march=armv8.4-a + else + ifeq ($(APPLECLANG),1) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=armv8.4-a+sme + else + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=armv8.4-a+sme + override LDFLAGS += -lclang_rt_builtins-aarch64 + endif + ifdef OS_WINDOWS + ifeq ($(C_COMPILER), CLANG) + override CFLAGS += --aarch64-stack-hazard-size=0 + endif + endif + endif endif ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1 index 0fc6720944..221cc51273 100644 --- a/kernel/Makefile.L1 +++ b/kernel/Makefile.L1 @@ -1,3 +1,31 @@ +############################################################################### +# Copyright (c) 2025 The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +############################################################################### + FMAFLAG= ifndef OLDGCC ifdef HAVE_FMA3 @@ -271,6 +299,10 @@ XDOTKERNEL = zdot.S endif ifeq ($(BUILD_BFLOAT16),1) +ifndef BSCALKERNEL +BSCALKERNEL = ../generic/scal.c +endif + ifndef SBDOTKERNEL SBDOTKERNEL = ../x86_64/sbdot.c endif @@ -551,6 +583,8 @@ XBLASOBJS += \ xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) xsum_k$(TSUFFIX).$(SUFFIX) ifeq ($(BUILD_BFLOAT16),1) +BBLASOBJS += \ + bscal_k$(TSUFFIX).$(SUFFIX) SBBLASOBJS += \ sbdot_k$(TSUFFIX).$(SUFFIX) SBEXTOBJS += \ @@ -778,6 +812,8 @@ $(KDIR)qdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNEL $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ ifeq ($(BUILD_BFLOAT16),1) +$(KDIR)bscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)bscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(BSCALKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ $(KDIR)sbdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sbdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBDOTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@ $(KDIR)sbstobf16_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TOBF16KERNEL) diff --git a/kernel/Makefile.L2 b/kernel/Makefile.L2 index 0332ba722e..aea0c9cbb4 100644 --- a/kernel/Makefile.L2 +++ b/kernel/Makefile.L2 @@ -1,3 +1,31 @@ +############################################################################### +# Copyright (c) 2025 The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +############################################################################### + FMAFLAG= ifndef OLDGCC ifdef HAVE_FMA3 @@ -56,6 +84,14 @@ XGEMVTKERNEL = zgemv_t.S endif ifeq ($(BUILD_BFLOAT16),1) +ifndef BGEMVNKERNEL +BGEMVNKERNEL = ../generic/gemv_n.c +endif + +ifndef BGEMVTKERNEL +BGEMVTKERNEL = ../generic/gemv_t.c +endif + ifndef SBGEMVNKERNEL SBGEMVNKERNEL = ../x86_64/sbgemv_n.c endif @@ -65,6 +101,16 @@ SBGEMVTKERNEL = ../x86_64/sbgemv_t.c endif endif +ifeq ($(BUILD_HFLOAT16),1) +ifndef SHGEMVNKERNEL +SHGEMVNKERNEL = ../generic/gemv_n.c +endif + +ifndef SHGEMVTKERNEL +SHGEMVTKERNEL = ../generic/gemv_t.c +endif +endif + ### GER ### ifndef SGERKERNEL @@ -255,11 +301,20 @@ XBLASOBJS += \ xgeru_k$(TSUFFIX).$(SUFFIX) xgerc_k$(TSUFFIX).$(SUFFIX) xgerv_k$(TSUFFIX).$(SUFFIX) xgerd_k$(TSUFFIX).$(SUFFIX) ifeq ($(BUILD_BFLOAT16),1) +BBLASOBJS += \ + bgemv_n$(TSUFFIX).$(SUFFIX) \ + bgemv_t$(TSUFFIX).$(SUFFIX) SBBLASOBJS += \ sbgemv_n$(TSUFFIX).$(SUFFIX) \ sbgemv_t$(TSUFFIX).$(SUFFIX) endif +ifeq ($(BUILD_HFLOAT16),1) +SHBLASOBJS += \ + shgemv_n$(TSUFFIX).$(SUFFIX) \ + shgemv_t$(TSUFFIX).$(SUFFIX) +endif + ifneq "$(or $(BUILD_SINGLE), $(BUILD_DOUBLE), $(BUILD_COMPLEX))" "" $(KDIR)sgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)sgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -UTRANS $< -o $@ @@ -513,5 +568,16 @@ $(KDIR)sbgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)sbgemv_n$(TPSUFFIX).$(PSUFFIX) : $(KE $(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@ $(KDIR)sbgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)sbgemv_t$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMVTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@ +$(KDIR)bgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)bgemv_n$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(BGEMVNKERNEL) + $(CC) -c $(CFLAGS) -DBGEMM -UCOMPLEX $< -o $@ +$(KDIR)bgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)bgemv_t$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(BGEMVTKERNEL) + $(CC) -c $(CFLAGS) -DBGEMM -UCOMPLEX $< -o $@ +endif + +ifeq ($(BUILD_HFLOAT16),1) +$(KDIR)shgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)shgemv_n$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMVNKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@ +$(KDIR)shgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)shgemv_t$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMVTKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@ endif diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 2bd6b294fb..64a87ee25e 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -1,3 +1,30 @@ +############################################################################### +# Copyright (c) 2025, The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +############################################################################### USE_GEMM3M = 0 OS := $(shell uname) @@ -25,6 +52,18 @@ endif ifeq ($(ARCH), arm64) USE_TRMM = 1 USE_DIRECT_SGEMM = 1 +USE_DIRECT_SSYMM = 1 +USE_DIRECT_SSYRK = 1 +USE_DIRECT_SSYR2K = 1 +USE_DIRECT_STRMM = 1 +ifeq ($(CORE), ARMV9SME) +USE_SME = 1 +endif +ifeq ($(CORE), VORTEXM4) +ifneq ($(C_COMPILER), GCC) +USE_SME = 1 +endif +endif endif ifeq ($(ARCH), riscv64) @@ -101,15 +140,66 @@ SGEMMDIRECTKERNEL = sgemm_direct_skylakex.c SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c endif ifeq ($(ARCH), arm64) +SGEMMDIRECTKERNEL = sgemm_direct_arm64_sme1.c +SGEMMDIRECTKERNEL_ALPHA_BETA = sgemm_direct_alpha_beta_arm64_sme1.c +SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c +endif +endif +endif + +ifdef USE_DIRECT_SSYMM +ifndef SSYMMDIRECTKERNEL_ALPHA_BETA +ifeq ($(ARCH), arm64) +SSYMMDIRECTKERNEL_ALPHA_BETA = ssymm_direct_alpha_beta_arm64_sme1.c +endif +endif +endif + +ifdef USE_DIRECT_STRMM +ifndef STRMMDIRECTKERNEL +ifeq ($(ARCH), arm64) +STRMMDIRECTKERNEL = strmm_direct_arm64_sme1.c +endif +endif +endif + +ifdef USE_DIRECT_SSYRK +ifndef SSYRKDIRECTKERNEL_ALPHA_BETA +ifeq ($(ARCH), arm64) +SSYRKDIRECTKERNEL_ALPHA_BETA = ssyrk_direct_alpha_beta_arm64_sme1.c +endif +endif +endif + +ifdef USE_DIRECT_SSYR2K +ifndef SSYR2KDIRECTKERNEL_ALPHA_BETA +ifeq ($(ARCH), arm64) ifeq ($(TARGET_CORE), ARMV9SME) HAVE_SME = 1 endif -SGEMMDIRECTKERNEL = sgemm_direct_arm64_sme1.c +SSYR2KDIRECTKERNEL_ALPHA_BETA = ssyr2k_direct_alpha_beta_arm64_sme1.c endif endif endif ifeq ($(BUILD_BFLOAT16), 1) +ifndef BGEMMKERNEL +BGEMM_BETA = ../generic/gemm_beta.c +BGEMMKERNEL = ../generic/gemmkernel_2x2.c +BGEMMINCOPY = ../generic/gemm_ncopy_2.c +BGEMMITCOPY = ../generic/gemm_tcopy_2.c +BGEMMONCOPY = ../generic/gemm_ncopy_2.c +BGEMMOTCOPY = ../generic/gemm_tcopy_2.c +BGEMMINCOPYOBJ = bgemm_incopy$(TSUFFIX).$(SUFFIX) +BGEMMITCOPYOBJ = bgemm_itcopy$(TSUFFIX).$(SUFFIX) +BGEMMONCOPYOBJ = bgemm_oncopy$(TSUFFIX).$(SUFFIX) +BGEMMOTCOPYOBJ = bgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif +BKERNELOBJS += \ + bgemm_kernel$(TSUFFIX).$(SUFFIX) \ + $(BGEMMINCOPYOBJ) $(BGEMMITCOPYOBJ) \ + $(BGEMMONCOPYOBJ) $(BGEMMOTCOPYOBJ) + ifndef SBGEMMKERNEL SBGEMM_BETA = ../generic/gemm_beta.c SBGEMMKERNEL = ../generic/gemmkernel_2x2.c @@ -129,6 +219,26 @@ SBKERNELOBJS += \ $(SBGEMMONCOPYOBJ) $(SBGEMMOTCOPYOBJ) endif +ifeq ($(BUILD_HFLOAT16), 1) +ifndef SHGEMMKERNEL +SHGEMM_BETA = ../generic/gemm_beta.c +SHGEMMKERNEL = ../generic/gemmkernel_2x2.c +SHGEMMONCOPY = ../generic/gemm_ncopy_2.c +SHGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) +SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) +SHGEMMINCOPY = ../generic/gemm_ncopy_2.c +SHGEMMITCOPY = ../generic/gemm_tcopy_2.c +SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) +SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +SHKERNELOBJS += \ + shgemm_kernel$(TSUFFIX).$(SUFFIX) \ + $(SHGEMMINCOPYOBJ) $(SHGEMMITCOPYOBJ) \ + $(SHGEMMONCOPYOBJ) $(SHGEMMOTCOPYOBJ) +endif + ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" "" SKERNELOBJS += \ sgemm_kernel$(TSUFFIX).$(SUFFIX) \ @@ -144,16 +254,52 @@ SKERNELOBJS += \ endif ifeq ($(ARCH), arm64) SKERNELOBJS += \ - sgemm_direct$(TSUFFIX).$(SUFFIX) -ifdef HAVE_SME + sgemm_direct_performant$(TSUFFIX).$(SUFFIX) \ + sgemm_direct$(TSUFFIX).$(SUFFIX) \ + sgemm_direct_alpha_beta$(TSUFFIX).$(SUFFIX) +ifdef USE_SME SKERNELOBJS += \ - sgemm_direct_sme1$(TSUFFIX).$(SUFFIX) \ + sgemm_direct_sme1_2VLx2VL$(TSUFFIX).$(SUFFIX) \ sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX) endif endif endif endif +ifdef USE_DIRECT_SSYMM +ifeq ($(ARCH), arm64) +SKERNELOBJS += \ + ssymm_direct_alpha_betaLU$(TSUFFIX).$(SUFFIX) \ + ssymm_direct_alpha_betaLL$(TSUFFIX).$(SUFFIX) +endif +endif + +ifdef USE_DIRECT_STRMM +ifeq ($(ARCH), arm64) +SKERNELOBJS += \ + strmm_direct_LNUN$(TSUFFIX).$(SUFFIX) strmm_direct_LNLN$(TSUFFIX).$(SUFFIX) \ + strmm_direct_LTUN$(TSUFFIX).$(SUFFIX) strmm_direct_LTLN$(TSUFFIX).$(SUFFIX) +endif +endif + +ifdef USE_DIRECT_SSYRK +ifeq ($(ARCH), arm64) +SKERNELOBJS += \ + ssyrk_direct_alpha_betaUN$(TSUFFIX).$(SUFFIX) ssyrk_direct_alpha_betaUT$(TSUFFIX).$(SUFFIX) \ + ssyrk_direct_alpha_betaLN$(TSUFFIX).$(SUFFIX) ssyrk_direct_alpha_betaLT$(TSUFFIX).$(SUFFIX) +endif +endif + +ifdef USE_DIRECT_SSYR2K +ifeq ($(ARCH), arm64) +SKERNELOBJS += \ + ssyr2k_direct_alpha_betaUN$(TSUFFIX).$(SUFFIX) ssyr2k_direct_alpha_betaUN$(TSUFFIX).$(SUFFIX) \ + ssyr2k_direct_alpha_betaUT$(TSUFFIX).$(SUFFIX) ssyr2k_direct_alpha_betaUT$(TSUFFIX).$(SUFFIX) \ + ssyr2k_direct_alpha_betaLN$(TSUFFIX).$(SUFFIX) ssyr2k_direct_alpha_betaLN$(TSUFFIX).$(SUFFIX) \ + ssyr2k_direct_alpha_betaLT$(TSUFFIX).$(SUFFIX) ssyr2k_direct_alpha_betaLT$(TSUFFIX).$(SUFFIX) +endif +endif + ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" DKERNELOBJS += \ dgemm_beta$(TSUFFIX).$(SUFFIX) \ @@ -190,8 +336,12 @@ XKERNELOBJS += \ $(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ) ifeq ($(BUILD_BFLOAT16),1) +BBLASOBJS += $(BKERNELOBJS) SBBLASOBJS += $(SBKERNELOBJS) endif +ifeq ($(BUILD_HFLOAT16),1) +SHBLASOBJS += $(SHKERNELOBJS) +endif SBLASOBJS += $(SKERNELOBJS) DBLASOBJS += $(DKERNELOBJS) QBLASOBJS += $(QKERNELOBJS) @@ -200,8 +350,12 @@ ZBLASOBJS += $(ZKERNELOBJS) XBLASOBJS += $(XKERNELOBJS) ifeq ($(BUILD_BFLOAT16),1) +BBLASOBJS += bgemm_beta$(TSUFFIX).$(SUFFIX) SBBLASOBJS += sbgemm_beta$(TSUFFIX).$(SUFFIX) endif +ifeq ($(BUILD_HFLOAT16),1) +SHBLASOBJS += shgemm_beta$(TSUFFIX).$(SUFFIX) +endif ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" "" SBLASOBJS += \ @@ -493,6 +647,15 @@ SBBLASOBJS += \ sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) endif +ifeq ($(BUILD_HFLOAT16),1) +SHBLASOBJS += \ + shgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ + shgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) shgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + shgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) shgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + shgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) shgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + shgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) shgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) +endif + SBLASOBJS += \ sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ @@ -593,12 +756,23 @@ ZBLASOBJS += \ endif ifeq ($(BUILD_BFLOAT16), 1) +BGEMMINCOPYOBJ_P = $(BGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +BGEMMITCOPYOBJ_P = $(BGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +BGEMMONCOPYOBJ_P = $(BGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +BGEMMOTCOPYOBJ_P = $(BGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SBGEMMINCOPYOBJ_P = $(SBGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SBGEMMITCOPYOBJ_P = $(SBGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SBGEMMONCOPYOBJ_P = $(SBGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SBGEMMOTCOPYOBJ_P = $(SBGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) endif +ifeq ($(BUILD_HFLOAT16), 1) +SHGEMMINCOPYOBJ_P = $(SHGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +SHGEMMITCOPYOBJ_P = $(SHGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +SHGEMMONCOPYOBJ_P = $(SHGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +SHGEMMOTCOPYOBJ_P = $(SHGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +endif + SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SGEMMITCOPYOBJ_P = $(SGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SGEMMONCOPYOBJ_P = $(SGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) @@ -625,10 +799,17 @@ XGEMMONCOPYOBJ_P = $(XGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) XGEMMOTCOPYOBJ_P = $(XGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) ifeq ($(BUILD_BFLOAT16),1) +$(KDIR)bgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(BGEMM_BETA) + $(CC) $(CFLAGS) -c -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)sbgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA) $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ endif +ifeq ($(BUILD_HFLOAT16),1) +$(KDIR)shgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA) + $(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ +endif + $(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ @@ -651,9 +832,22 @@ ifeq ($(ARCH), E2K) USE_TRMM = 1 endif - ifeq ($(BUILD_BFLOAT16), 1) +$(KDIR)$(BGEMMONCOPYOBJ) : $(KERNELDIR)/$(BGEMMONCOPY) + $(CC) $(CFLAGS) -c -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(BGEMMOTCOPYOBJ) : $(KERNELDIR)/$(BGEMMOTCOPY) + $(CC) $(CFLAGS) -c -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(BGEMM_UNROLL_M), $(BGEMM_UNROLL_N)) +$(KDIR)$(BGEMMINCOPYOBJ) : $(KERNELDIR)/$(BGEMMINCOPY) + $(CC) $(CFLAGS) -c -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(BGEMMITCOPYOBJ) : $(KERNELDIR)/$(BGEMMITCOPY) + $(CC) $(CFLAGS) -c -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX $< -o $@ +endif + $(KDIR)$(SBGEMMONCOPYOBJ) : $(KERNELDIR)/$(SBGEMMONCOPY) $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ @@ -671,6 +865,25 @@ $(KDIR)$(SBGEMMITCOPYOBJ) : $(KERNELDIR)/$(SBGEMMITCOPY) endif endif +ifeq ($(BUILD_HFLOAT16), 1) + +$(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY) + $(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(SHGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SHGEMMOTCOPY) + $(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +#ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) + +$(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY) + $(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY) + $(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +#endif +endif + $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ @@ -836,23 +1049,59 @@ $(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ endif ifeq ($(ARCH), arm64) +$(KDIR)sgemm_direct_performant$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTPERFORMANT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -ifdef HAVE_SME -$(KDIR)sgemm_direct_sme1$(TSUFFIX).$(SUFFIX) : - $(CC) $(CFLAGS) -c $(KERNELDIR)/sgemm_direct_sme1.S -UDOUBLE -UCOMPLEX -o $@ +$(KDIR)sgemm_direct_alpha_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL_ALPHA_BETA) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +ifdef USE_SME +$(KDIR)sgemm_direct_sme1_2VLx2VL$(TSUFFIX).$(SUFFIX) : + $(CC) $(CFLAGS) -c $(KERNELDIR)/sgemm_direct_sme1_2VLx2VL.S -UDOUBLE -UCOMPLEX -o $@ $(KDIR)sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX) : $(CC) $(CFLAGS) -c $(KERNELDIR)/sgemm_direct_sme1_preprocess.S -UDOUBLE -UCOMPLEX -o $@ endif endif endif -ifeq ($(BUILD_BFLOAT16), 1) +ifdef USE_DIRECT_SSYMM +ifeq ($(ARCH), arm64) +$(KDIR)ssymm_direct_alpha_betaLU$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMDIRECTKERNEL_ALPHA_BETA) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DLEFT -DUPPER $< -o $@ +$(KDIR)ssymm_direct_alpha_betaLL$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMDIRECTKERNEL_ALPHA_BETA) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DLEFT -DLOWER $< -o $@ +endif +endif + +ifdef USE_DIRECT_SSYRK +ifeq ($(ARCH), arm64) +$(KDIR)ssyrk_direct_alpha_betaUN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYRKDIRECTKERNEL_ALPHA_BETA) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DUPPER -UTRANSA $< -o $@ + +$(KDIR)ssyrk_direct_alpha_betaUT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYRKDIRECTKERNEL_ALPHA_BETA) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DUPPER -DTRANSA $< -o $@ + +$(KDIR)ssyrk_direct_alpha_betaLN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYRKDIRECTKERNEL_ALPHA_BETA) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UUPPER -UTRANSA $< -o $@ +$(KDIR)ssyrk_direct_alpha_betaLT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYRKDIRECTKERNEL_ALPHA_BETA) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UUPPER -DTRANSA $< -o $@ +endif +endif + +ifeq ($(BUILD_BFLOAT16), 1) +$(KDIR)bgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(BGEMMKERNEL) + $(CC) $(CFLAGS) -c -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)sbgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND) $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ endif +ifeq ($(BUILD_HFLOAT16), 1) + +$(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) + $(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ +endif + $(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_kernel$(TSUFFIX).s @@ -967,6 +1216,20 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ +ifdef USE_DIRECT_SSYR2K +ifeq ($(ARCH), arm64) +$(KDIR)ssyr2k_direct_alpha_betaUN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYR2KDIRECTKERNEL_ALPHA_BETA) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DUPPER -UTRANSA $< -o $@ +$(KDIR)ssyr2k_direct_alpha_betaUT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYR2KDIRECTKERNEL_ALPHA_BETA) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DUPPER -DTRANSA $< -o $@ +$(KDIR)ssyr2k_direct_alpha_betaLN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYR2KDIRECTKERNEL_ALPHA_BETA) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UUPPER -UTRANSA $< -o $@ +$(KDIR)ssyr2k_direct_alpha_betaLT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYR2KDIRECTKERNEL_ALPHA_BETA) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UUPPER -DTRANSA $< -o $@ + +endif +endif + ifdef USE_TRMM $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) ifeq ($(OS), AIX) @@ -1008,6 +1271,23 @@ else $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ endif + +ifdef USE_DIRECT_STRMM +ifeq ($(ARCH), arm64) +$(KDIR)strmm_direct_LNUN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMDIRECTKERNEL) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UTRANSA -DUPPER $< -o $@ + +$(KDIR)strmm_direct_LNLN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMDIRECTKERNEL) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UTRANSA -UUPPER $< -o $@ + +$(KDIR)strmm_direct_LTUN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMDIRECTKERNEL) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DTRANSA -DUPPER $< -o $@ + +$(KDIR)strmm_direct_LTLN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMDIRECTKERNEL) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DTRANSA -UUPPER $< -o $@ +endif +endif + $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > dtrmm_kernel_ln.s @@ -2836,10 +3116,17 @@ $(KDIR)sgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ ifeq ($(BUILD_BFLOAT16),1) +$(KDIR)bgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(BGEMM_BETA) + $(CC) $(PFLAGS) -c -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)sbgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA) $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ endif +ifeq ($(BUILD_HFLOAT16),1) +$(KDIR)shgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA) + $(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ +endif + $(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA) $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ @@ -2857,6 +3144,20 @@ $(KDIR)xgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) ifeq ($(BUILD_BFLOAT16), 1) +$(BGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(BGEMMONCOPY) + $(CC) $(PFLAGS) -c -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX $< -o $@ + +$(BGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(BGEMMOTCOPY) + $(CC) $(PFLAGS) -c -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(BGEMM_UNROLL_M), $(BGEMM_UNROLL_N)) +$(BGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(BGEMMINCOPY) + $(CC) $(PFLAGS) -c -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX $< -o $@ + +$(BGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(BGEMMITCOPY) + $(CC) $(PFLAGS) -c -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX $< -o $@ +endif + $(SBGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMONCOPY) $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ @@ -2869,10 +3170,26 @@ $(SBGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMINCOPY) $(SBGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMITCOPY) $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ - endif endif +ifeq ($(BUILD_HFLOAT16), 1) +$(SHGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMONCOPY) + $(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(SHGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMOTCOPY) + $(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +#ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) +$(SHGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMINCOPY) + $(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(SHGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMITCOPY) + $(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +#endif +endif + $(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ @@ -2979,10 +3296,17 @@ endif ifeq ($(BUILD_BFLOAT16), 1) +$(KDIR)bgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(BGEMMKERNEL) $(BGEMMDEPEND) + $(CC) $(PFLAGS) -c -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)sbgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND) $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ endif +ifeq ($(BUILD_HFLOAT16), 1) +$(KDIR)shgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) + $(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ +endif + $(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ @@ -4843,6 +5167,71 @@ $(KDIR)sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMA $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ endif +ifeq ($(BUILD_HFLOAT16), 1) +ifndef SHGEMM_SMALL_M_PERMIT +SHGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c +endif + +ifndef SHGEMM_SMALL_K_NN +SHGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c +endif + +ifndef SHGEMM_SMALL_K_NT +SHGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c +endif + +ifndef SHGEMM_SMALL_K_TN +SHGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c +endif + +ifndef SHGEMM_SMALL_K_TT +SHGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c +endif + +$(KDIR)shgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)shgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)shgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)shgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)shgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +ifndef SHGEMM_SMALL_K_B0_NN +SHGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c +endif + +ifndef SHGEMM_SMALL_K_B0_NT +SHGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c +endif + +ifndef SHGEMM_SMALL_K_B0_TN +SHGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c +endif + +ifndef SHGEMM_SMALL_K_B0_TT +SHGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c +endif + +$(KDIR)shgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ + +$(KDIR)shgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ + +$(KDIR)shgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ + +$(KDIR)shgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ +endif + ifndef CGEMM_SMALL_M_PERMIT CGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c endif diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index 3d68271da4..6c1da89532 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -4,3 +4,12 @@ SGEMVNKERNEL = gemv_n_sve_v4x3.c DGEMVNKERNEL = gemv_n_sve_v4x3.c SGEMVTKERNEL = gemv_t_sve_v4x3.c DGEMVTKERNEL = gemv_t_sve_v4x3.c + +DDOTKERNEL = dot.c +SDOTKERNEL = dot.c + +SAXPYKERNEL = axpy_sve.c +DAXPYKERNEL = axpy_sve.c + +SGERKERNEL = ger_sve_v1x3.c +DGERKERNEL = ger_sve_v1x3.c diff --git a/kernel/arm64/KERNEL.AMPERE1 b/kernel/arm64/KERNEL.AMPERE1 new file mode 100644 index 0000000000..46a34469c3 --- /dev/null +++ b/kernel/arm64/KERNEL.AMPERE1 @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.NEOVERSEN1 diff --git a/kernel/arm64/KERNEL.NEOVERSEN1 b/kernel/arm64/KERNEL.NEOVERSEN1 index b50966f2f4..7afbb59dd9 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN1 +++ b/kernel/arm64/KERNEL.NEOVERSEN1 @@ -102,18 +102,8 @@ ZNRM2KERNEL = znrm2.S DDOTKERNEL = dot.c SDOTKERNEL = dot.c -ifeq ($(OSNAME), WINNT) -ifeq ($(C_COMPILER), CLANG) -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -else -CDOTKERNEL = zdot_thunderx2t99.c -ZDOTKERNEL = zdot_thunderx2t99.c -endif -else CDOTKERNEL = zdot_thunderx2t99.c ZDOTKERNEL = zdot_thunderx2t99.c -endif DSDOTKERNEL = dot.S DGEMM_BETA = dgemm_beta.S diff --git a/kernel/arm64/KERNEL.NEOVERSEN2 b/kernel/arm64/KERNEL.NEOVERSEN2 index c8d511f209..6431422faa 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN2 +++ b/kernel/arm64/KERNEL.NEOVERSEN2 @@ -188,6 +188,20 @@ ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +ifeq ($(BUILD_BFLOAT16), 1) +BGEMM_BETA = bgemm_beta_neon.c +BGEMMKERNEL = sbgemm_kernel_$(BGEMM_UNROLL_M)x$(BGEMM_UNROLL_N)_neoversen2.c +BGEMMINCOPY = sbgemm_ncopy_$(BGEMM_UNROLL_M)_neoversen2.c +BGEMMITCOPY = sbgemm_tcopy_$(BGEMM_UNROLL_M)_neoversen2.c +BGEMMONCOPY = sbgemm_ncopy_$(BGEMM_UNROLL_N)_neoversen2.c +BGEMMOTCOPY = sbgemm_tcopy_$(BGEMM_UNROLL_N)_neoversen2.c +BGEMMINCOPYOBJ = bgemm_incopy$(TSUFFIX).$(SUFFIX) +BGEMMITCOPYOBJ = bgemm_itcopy$(TSUFFIX).$(SUFFIX) +BGEMMONCOPYOBJ = bgemm_oncopy$(TSUFFIX).$(SUFFIX) +BGEMMOTCOPYOBJ = bgemm_otcopy$(TSUFFIX).$(SUFFIX) +BGEMVTKERNEL = sbgemv_t_bfdot.c +BGEMVNKERNEL = bgemv_n_sve_v3x4.c + SBGEMM_BETA = sbgemm_beta_neoversen2.c SBGEMMKERNEL = sbgemm_kernel_$(SBGEMM_UNROLL_M)x$(SBGEMM_UNROLL_N)_neoversen2.c SBGEMMINCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_M)_neoversen2.c @@ -199,4 +213,5 @@ SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) SBGEMVTKERNEL = sbgemv_t_bfdot.c -SBGEMVNKERNEL = sbgemv_n_neon.c \ No newline at end of file +SBGEMVNKERNEL = sbgemv_n_neon.c +endif diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1 index 3e622bcbfb..db80458290 100644 --- a/kernel/arm64/KERNEL.NEOVERSEV1 +++ b/kernel/arm64/KERNEL.NEOVERSEV1 @@ -1,24 +1,75 @@ +############################################################################### +# Copyright (c) 2025, The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +############################################################################### + include $(KERNELDIR)/KERNEL.ARMV8SVE SGEMVNKERNEL = gemv_n_sve_v1x3.c DGEMVNKERNEL = gemv_n_sve_v1x3.c SGEMVTKERNEL = gemv_t_sve_v1x3.c DGEMVTKERNEL = gemv_t_sve_v1x3.c + +SAXPYKERNEL = axpy_sve.c +DAXPYKERNEL = axpy_sve.c + ifeq ($(BUILD_BFLOAT16), 1) +BGEMM_BETA = bgemm_beta_neon.c +BGEMMKERNEL = bgemm_kernel_2vlx4_neoversev1.c +ifneq ($(BGEMM_UNROLL_M), $(BGEMM_UNROLL_N)) +BGEMMINCOPY = bgemm_ncopy_2vl_neoversev1.c +BGEMMITCOPY = bgemm_tcopy_2vl_neoversev1.c +BGEMMINCOPYOBJ = bgemm_incopy$(TSUFFIX).$(SUFFIX) +BGEMMITCOPYOBJ = bgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +BGEMMONCOPY = bgemm_ncopy_4_neoversev1.c +BGEMMOTCOPY = bgemm_tcopy_4_neoversev1.c +BGEMMONCOPYOBJ = bgemm_oncopy$(TSUFFIX).$(SUFFIX) +BGEMMOTCOPYOBJ = bgemm_otcopy$(TSUFFIX).$(SUFFIX) + +BGEMVTKERNEL = sbgemv_t_bfdot.c +BGEMVNKERNEL = bgemv_n_sve_v3x4.c + SBGEMM_BETA = sbgemm_beta_neoversev1.c -SBGEMMKERNEL = sbgemm_kernel_$(SBGEMM_UNROLL_M)x$(SBGEMM_UNROLL_N)_neoversev1.c +SBGEMMKERNEL = bgemm_kernel_2vlx4_neoversev1.c ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N)) -SBGEMMINCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_M)_neoversev1.c -SBGEMMITCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_M)_neoversev1.c +SBGEMMINCOPY = bgemm_ncopy_2vl_neoversev1.c +SBGEMMITCOPY = bgemm_tcopy_2vl_neoversev1.c SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -SBGEMMONCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversev1.c -SBGEMMOTCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_N)_neoversev1.c +SBGEMMONCOPY = bgemm_ncopy_4_neoversev1.c +SBGEMMOTCOPY = bgemm_tcopy_4_neoversev1.c SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) SBGEMVNKERNEL = sbgemv_n_neon.c SBGEMVTKERNEL = sbgemv_t_bfdot.c -endif \ No newline at end of file +endif + +SGERKERNEL = ger_sve_v1x3.c +DGERKERNEL = ger_sve_v1x3.c diff --git a/kernel/arm64/KERNEL.NEOVERSEV2 b/kernel/arm64/KERNEL.NEOVERSEV2 index e08efdb9d8..8f58833cdb 100644 --- a/kernel/arm64/KERNEL.NEOVERSEV2 +++ b/kernel/arm64/KERNEL.NEOVERSEV2 @@ -1,6 +1 @@ -include $(KERNELDIR)/KERNEL.ARMV8SVE - -ifeq ($(BUILD_BFLOAT16), 1) -SBGEMVTKERNEL = sbgemv_t_bfdot.c -SBGEMVNKERNEL = sbgemv_n_neon.c -endif \ No newline at end of file +include $(KERNELDIR)/KERNEL.NEOVERSEN2 diff --git a/kernel/arm64/KERNEL.VORTEXM4 b/kernel/arm64/KERNEL.VORTEXM4 new file mode 100644 index 0000000000..46a34469c3 --- /dev/null +++ b/kernel/arm64/KERNEL.VORTEXM4 @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.NEOVERSEN1 diff --git a/kernel/arm64/axpy_sve.c b/kernel/arm64/axpy_sve.c new file mode 100644 index 0000000000..69a99cfee6 --- /dev/null +++ b/kernel/arm64/axpy_sve.c @@ -0,0 +1,86 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include + +#include "common.h" + +#ifdef DOUBLE +#define SV_TYPE svfloat64_t +#define SV_COUNT svcntd +#define SV_DUP svdup_f64 +#define SV_WHILE svwhilelt_b64_s64 +#define SV_TRUE svptrue_b64 +#else +#define SV_TYPE svfloat32_t +#define SV_COUNT svcntw +#define SV_DUP svdup_f32 +#define SV_WHILE svwhilelt_b32_s64 +#define SV_TRUE svptrue_b32 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + BLASLONG sve_size = SV_COUNT(); + + if (n < 0) return (0); + if (da == 0.0) return (0); + + if (inc_x == 1 && inc_y == 1) { + SV_TYPE da_vec = SV_DUP(da); + for (i = 0; i + sve_size - 1 < n; i += sve_size) { + SV_TYPE x_vec = svld1(SV_TRUE(), &x[i]); + SV_TYPE y_vec = svld1(SV_TRUE(), &y[i]); + y_vec = svmla_x(SV_TRUE(), y_vec, da_vec, x_vec); + svst1(SV_TRUE(), &y[i], y_vec); + } + + if (i < n) { + svbool_t pg = SV_WHILE(i, n); + SV_TYPE x_vec = svld1(pg, &x[i]); + SV_TYPE y_vec = svld1(pg, &y[i]); + y_vec = svmla_x(pg, y_vec, da_vec, x_vec); + svst1(pg, &y[i], y_vec); + } + return (0); + } + + while (i < n) { + y[iy] += da * x[ix]; + ix += inc_x; + iy += inc_y; + i++; + } + + return (0); +} diff --git a/kernel/arm64/bgemm_beta_neon.c b/kernel/arm64/bgemm_beta_neon.c new file mode 100644 index 0000000000..603377f8f0 --- /dev/null +++ b/kernel/arm64/bgemm_beta_neon.c @@ -0,0 +1,107 @@ +/*************************************************************************** + * Copyright (c) 2025, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#include "common.h" + +#include + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta_in, IFLOAT *dummy2, + BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5, FLOAT *c, + BLASLONG ldc) { + BLASLONG i, j; + BLASLONG chunk, remain; + + bfloat16_t *ptr_c, *ptr_c0; + + bfloat16x8_t x0, z0; + float32x4_t y0, y1; + + float x; + bfloat16_t z; + + bfloat16_t zero_bf16 = vcvth_bf16_f32(0.0f); + bfloat16x8_t zeros = vdupq_n_bf16(zero_bf16); + + bfloat16_t beta_bf16; + memcpy(&beta_bf16, &beta_in, sizeof(bfloat16_t)); + float beta = vcvtah_f32_bf16(beta_bf16); + float32x4_t beta_neon = vdupq_n_f32(beta); + + ptr_c = (bfloat16_t *)c; + + chunk = m >> 3; + remain = m & 7; + + if (beta == 0.0f){ + for (j = 0; j < n; j ++){ + ptr_c0 = ptr_c; + ptr_c += ldc; + + for (i = 0; i < chunk; i ++){ + vst1q_bf16(ptr_c0, zeros); + ptr_c0 += 8; + } + + for (i = 0; i < remain; i ++){ + ptr_c0[0] = zero_bf16; + ptr_c0 ++; + } + } + } else { + for (j = 0; j < n; j ++){ + ptr_c0 = ptr_c; + ptr_c += ldc; + + for (i = 0; i < chunk; i ++){ + x0 = vld1q_bf16(ptr_c0); + + y0 = vcvtq_low_f32_bf16(x0); + y1 = vcvtq_high_f32_bf16(x0); + + y0 = vmulq_f32(y0, beta_neon); + y1 = vmulq_f32(y1, beta_neon); + + z0 = vcvtq_low_bf16_f32(y0); + z0 = vcvtq_high_bf16_f32(z0, y1); + + vst1q_bf16(ptr_c0, z0); + + ptr_c0 += 8; + } + + for (i = 0; i < remain; i ++){ + x = vcvtah_f32_bf16(ptr_c0[0]); + z = vcvth_bf16_f32(x * beta); + + ptr_c0[0] = z; + ptr_c0 ++; + } + } + } + return 0; +}; diff --git a/kernel/arm64/sbgemm_kernel_4x4_neoversev1.c b/kernel/arm64/bgemm_kernel_2vlx4_neoversev1.c similarity index 76% rename from kernel/arm64/sbgemm_kernel_4x4_neoversev1.c rename to kernel/arm64/bgemm_kernel_2vlx4_neoversev1.c index 889b5fc5b8..8e135da4a5 100644 --- a/kernel/arm64/sbgemm_kernel_4x4_neoversev1.c +++ b/kernel/arm64/bgemm_kernel_2vlx4_neoversev1.c @@ -1,5 +1,5 @@ /*************************************************************************** - * Copyright (c) 2024-2025, The OpenBLAS Project + * Copyright (c) 2025, The OpenBLAS Project * All rights reserved. * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are @@ -27,20 +27,31 @@ * *****************************************************************************/ #include +#include #include "common.h" #define ALPHA_ONE -#include "sbgemm_kernel_4x4_neoversev1_impl.c" +#include "bgemm_kernel_2vlx4_neoversev1_impl.c" #undef ALPHA_ONE -#include "sbgemm_kernel_4x4_neoversev1_impl.c" +#undef UPDATE_C +#undef UPDATE_C2 +#undef UPDATE_C1 +#include "bgemm_kernel_2vlx4_neoversev1_impl.c" int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, BLASLONG ldc) { - if (alpha == 1.0f) - return sbgemm_kernel_neoversev1_alpha_one(m, n, k, alpha, A, B, C, ldc); +#ifdef BGEMM + bfloat16_t alpha_bf16; + memcpy(&alpha_bf16, &alpha, sizeof(bfloat16_t)); + float alpha_f32 = vcvtah_f32_bf16(alpha_bf16); +#else + float alpha_f32 = alpha; +#endif + + if (alpha_f32 == 1.0f) + return bgemm_kernel_neoversev1_alpha_one(m, n, k, alpha_f32, A, B, C, ldc); else - return sbgemm_kernel_neoversev1_alpha(m, n, k, alpha, A, B, C, ldc); + return bgemm_kernel_neoversev1_alpha(m, n, k, alpha_f32, A, B, C, ldc); return 0; } - diff --git a/kernel/arm64/bgemm_kernel_2vlx4_neoversev1_impl.c b/kernel/arm64/bgemm_kernel_2vlx4_neoversev1_impl.c new file mode 100644 index 0000000000..6749c7948f --- /dev/null +++ b/kernel/arm64/bgemm_kernel_2vlx4_neoversev1_impl.c @@ -0,0 +1,440 @@ +/*************************************************************************** + * Copyright (c) 2025, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#include +#include + +#include "common.h" + +#ifdef BGEMM + +#ifdef ALPHA_ONE + +#define TO16 vcvth_bf16_f32 +#define TO32 vcvtah_f32_bf16 + +#define UPDATE_C(PG, PTR, DST, SRC) \ + do { \ + svtmp16 = svld1_bf16((pghalf), (PTR)); \ + DST = svreinterpret_f32(svzip1_bf16(zeros, svtmp16)); \ + DST = svadd_z((PG), SRC, DST); \ + svtmp16 = svcvt_bf16_f32_z((PG), DST); \ + svtmp16 = svuzp1_bf16(svtmp16, svtmp16); \ + svst1_bf16((pghalf), (PTR), svtmp16); \ + } while (0); +#define UPDATE_C2(ptr, tmp, vector) \ + *(ptr) = TO16(vector[0] + TO32(*ptr)); \ + *(ptr + 1) = TO16(vector[1] + TO32(*(ptr + 1))); +#define UPDATE_C1(ptr, value) *ptr = TO16(TO32(*ptr) + (value)) + +#else + +#define UPDATE_C(PG, PTR, DST, SRC) \ + do { \ + svtmp16 = svld1_bf16((pghalf), (PTR)); \ + DST = svreinterpret_f32(svzip1_bf16(zeros, svtmp16)); \ + DST = svmad_z((PG), svalpha, SRC, DST); \ + svtmp16 = svcvt_bf16_f32_z((PG), DST); \ + svtmp16 = svuzp1_bf16(svtmp16, svtmp16); \ + svst1_bf16((pghalf), (PTR), svtmp16); \ + } while (0); +#define UPDATE_C2(ptr, tmp, vector) \ + *(ptr) = TO16(vector[0] * alpha + TO32(*ptr)); \ + *(ptr + 1) = TO16(vector[1] * alpha + TO32(*(ptr + 1))); +#define UPDATE_C1(ptr, value) *ptr = TO16(TO32(*ptr) + (value) * alpha) + +#endif + +#else + +#ifdef ALPHA_ONE + +#define UPDATE_C(PG, PTR, DST, SRC) \ + do { \ + DST = svld1_f32((PG), (PTR)); \ + DST = svadd_z((PG), SRC, DST); \ + svst1_f32((PG), (PTR), DST); \ + } while (0); +#define UPDATE_C2(ptr, tmp, vector) \ + tmp = vld1_f32(ptr); \ + tmp = vadd_f32(vector, tmp); \ + vst1_f32(ptr, tmp); +#define UPDATE_C1(ptr, value) *ptr = *ptr + (value) + +#else + +#define UPDATE_C(PG, PTR, DST, SRC) \ + do { \ + DST = svld1_f32((PG), (PTR)); \ + DST = svmad_z((PG), svalpha, SRC, DST); \ + svst1_f32((PG), (PTR), DST); \ + } while (0); +#define UPDATE_C2(ptr, tmp, vector) \ + tmp = vld1_f32(ptr); \ + tmp = vmla_n_f32(tmp, vector, alpha); \ + vst1_f32(ptr, tmp); +#define UPDATE_C1(ptr, value) *ptr = *ptr + (value) * alpha + +#endif + +#endif + +#ifdef BGEMM +#define OUTPUT_FLOAT bfloat16_t +#else +#define OUTPUT_FLOAT float +#endif + +#ifdef ALPHA_ONE +static int bgemm_kernel_neoversev1_alpha_one(BLASLONG m, BLASLONG n, BLASLONG k, + float alpha, IFLOAT *AA, IFLOAT *BB, + FLOAT *CC, BLASLONG ldc) +#else +static int bgemm_kernel_neoversev1_alpha(BLASLONG m, BLASLONG n, BLASLONG k, + float alpha, IFLOAT *AA, IFLOAT *BB, FLOAT *CC, + BLASLONG ldc) +#endif +{ + BLASLONG pad_k = (k + 3) & ~3; + +#ifndef ALPHA_ONE + svfloat32_t svalpha = svdup_f32(alpha); +#endif + + bfloat16_t *ptr_a = (bfloat16_t *)AA; + bfloat16_t *ptr_b = (bfloat16_t *)BB; + OUTPUT_FLOAT *ptr_c =(OUTPUT_FLOAT*)CC; + + bfloat16_t *ptr_a0; + bfloat16_t *ptr_b0; + OUTPUT_FLOAT *ptr_c0, *ptr_c1, *ptr_c2, *ptr_c3; + svfloat32_t tmp0, tmp1, tmp2, tmp3; +#ifdef BGEMM + svbfloat16_t zeros = svdup_n_bf16(TO16(0.0)); + svbfloat16_t svtmp16; +#else + float32x2_t tmp4, tmp5, tmp6, tmp7; +#endif + + const int sve_size_bf16 = svcnth(); + const int num_accumulators = sve_size_bf16 >> 1; + + svbool_t pgtrue = svptrue_b16(); +#ifdef BGEMM + // For BF16 load/store we use half the vector size + svbool_t pghalf = svwhilelt_b16(0, num_accumulators); +#endif + + // N values are 4x2 packed matrices + int n_step = 0; + const int n2 = n & -2; + const int n4 = n & -4; + + // For 256-bit this would be 8 + const int m_acc = (m & -num_accumulators); + const int m2 = m & -2; + + for (; n_step < n4; n_step += 4) { + ptr_a = (bfloat16_t *)AA; + ptr_c0 = ptr_c; + ptr_c1 = ptr_c0 + ldc; + ptr_c2 = ptr_c1 + ldc; + ptr_c3 = ptr_c2 + ldc; + ptr_c += 4 * ldc; + + int m_step = 0; + for (; m_step < m_acc; m_step += num_accumulators) { + svfloat32_t acc0 = svdup_f32(0); + svfloat32_t acc1 = svdup_f32(0); + svfloat32_t acc2 = svdup_f32(0); + svfloat32_t acc3 = svdup_f32(0); + + ptr_a0 = ptr_a; + ptr_b0 = ptr_b; + ptr_a += num_accumulators * pad_k; + + // Load entire 2VL block + for (BLASLONG p = 0; p < pad_k; p += 4) { + svbfloat16_t ma0 = svld1_bf16(pgtrue, ptr_a0); + svbfloat16_t ma1 = svld1_bf16(pgtrue, ptr_a0 + sve_size_bf16); + svbfloat16_t mb0 = svld1rq_bf16(pgtrue, ptr_b0); + svbfloat16_t mb1 = svld1rq_bf16(pgtrue, ptr_b0 + 8); + + acc0 = svbfmmla_f32(acc0, mb0, ma0); + acc1 = svbfmmla_f32(acc1, mb0, ma1); + acc2 = svbfmmla_f32(acc2, mb1, ma0); + acc3 = svbfmmla_f32(acc3, mb1, ma1); + + ptr_a0 += sve_size_bf16 * 2; + ptr_b0 += 16; + } + + svfloat32_t out0 = svreinterpret_f32_u64(svuzp1_u64(svreinterpret_u64_f32(acc0), svreinterpret_u64_f32(acc1))); + svfloat32_t out1 = svreinterpret_f32_u64(svuzp2_u64(svreinterpret_u64_f32(acc0), svreinterpret_u64_f32(acc1))); + + svfloat32_t out2 = svreinterpret_f32_u64(svuzp1_u64(svreinterpret_u64_f32(acc2), svreinterpret_u64_f32(acc3))); + svfloat32_t out3 = svreinterpret_f32_u64(svuzp2_u64(svreinterpret_u64_f32(acc2), svreinterpret_u64_f32(acc3))); + + UPDATE_C(pgtrue, ptr_c0, tmp0, out0); + UPDATE_C(pgtrue, ptr_c1, tmp1, out1); + UPDATE_C(pgtrue, ptr_c2, tmp2, out2); + UPDATE_C(pgtrue, ptr_c3, tmp3, out3); + + ptr_c0 += num_accumulators; + ptr_c1 += num_accumulators; + ptr_c2 += num_accumulators; + ptr_c3 += num_accumulators; + } + + for (; m_step < m2; m_step += 2) { + float32x4_t acc0 = {0,0,0,0}; + float32x4_t acc1 = {0,0,0,0}; + + ptr_a0 = ptr_a; + ptr_b0 = ptr_b; + ptr_a += 2 * pad_k; + + for (BLASLONG p = 0; p < pad_k; p += 4) { + bfloat16x8_t ma0 = vld1q_bf16(ptr_a0); + bfloat16x8_t mb0 = vld1q_bf16(ptr_b0); + bfloat16x8_t mb1 = vld1q_bf16(ptr_b0 + 8); + + acc0 = vbfmmlaq_f32(acc0, mb0, ma0); + acc1 = vbfmmlaq_f32(acc1, mb1, ma0); + + ptr_a0 += 8; + ptr_b0 += 16; + } + + UPDATE_C2(ptr_c0, tmp4, vget_low_f32(acc0)); + UPDATE_C2(ptr_c1, tmp5, vget_high_f32(acc0)); + UPDATE_C2(ptr_c2, tmp6, vget_low_f32(acc1)); + UPDATE_C2(ptr_c3, tmp7, vget_high_f32(acc1)); + + ptr_c0 += 2; + ptr_c1 += 2; + ptr_c2 += 2; + ptr_c3 += 2; + } + + // Final row is always a contiguous single row + if (m & 1) { + ptr_a0 = ptr_a; + ptr_b0 = ptr_b; + float32x4_t acc0 = {0,0,0,0}; + float32x4_t acc1 = {0,0,0,0}; + + for (BLASLONG p = 0; p < pad_k; p += 4) { + /// Same A value can be used for both B values + bfloat16x8_t ma0 = vreinterpretq_bf16_u64(vdupq_n_u64( + *((uint64_t*)ptr_a0) + )); + bfloat16x8_t mb0 = vld1q_bf16(ptr_b0); + bfloat16x8_t mb1 = vld1q_bf16(ptr_b0 + 8); + + acc0 = vbfmmlaq_f32(acc0, mb0, ma0); + acc1 = vbfmmlaq_f32(acc1, mb1, ma0); + + ptr_a0 += 4; + ptr_b0 += 16; + } + + UPDATE_C1(ptr_c0, acc0[1]); + UPDATE_C1(ptr_c1, acc0[3]); + UPDATE_C1(ptr_c2, acc1[1]); + UPDATE_C1(ptr_c3, acc1[3]); + } + + ptr_b += 4 * pad_k; + } + + for (; n_step < n2; n_step += 2) { + ptr_a = (bfloat16_t *)AA; + ptr_c0 = ptr_c; + ptr_c1 = ptr_c0 + ldc; + ptr_c += 2 * ldc; + + // Sets of two are contiguously packed so yay + int m_step = 0; + for (; m_step < m_acc; m_step += num_accumulators) { + svfloat32_t acc0 = svdup_f32(0); + svfloat32_t acc1 = svdup_f32(0); + ptr_a0 = ptr_a; + ptr_b0 = ptr_b; + ptr_a += num_accumulators * pad_k; + + // Load entire 8x4 block + for (BLASLONG p = 0; p < pad_k; p += 4) { + svbfloat16_t ma0 = svld1_bf16(pgtrue, ptr_a0); + svbfloat16_t ma1 = svld1_bf16(pgtrue, ptr_a0 + sve_size_bf16); + svbfloat16_t mb0 = svld1rq_bf16(pgtrue, ptr_b0); + + acc0 = svbfmmla_f32(acc0, mb0, ma0); + acc1 = svbfmmla_f32(acc1, mb0, ma1); + + ptr_a0 += sve_size_bf16 * 2; + ptr_b0 += 8; + } + + svfloat32_t out0 = svreinterpret_f32_u64(svuzp1_u64(svreinterpret_u64_f32(acc0), svreinterpret_u64_f32(acc1))); + svfloat32_t out1 = svreinterpret_f32_u64(svuzp2_u64(svreinterpret_u64_f32(acc0), svreinterpret_u64_f32(acc1))); + + UPDATE_C(pgtrue, ptr_c0, tmp0, out0); + UPDATE_C(pgtrue, ptr_c1, tmp1, out1); + + ptr_c0 += num_accumulators; + ptr_c1 += num_accumulators; + } + + for (; m_step < m2; m_step += 2) { + float32x4_t acc = {0,0,0,0}; + + ptr_a0 = ptr_a; + ptr_b0 = ptr_b; + ptr_a += 2 * pad_k; + + for (BLASLONG p = 0; p < pad_k; p += 4) { + bfloat16x8_t ma0 = vld1q_bf16(ptr_a0); + bfloat16x8_t mb0 = vld1q_bf16(ptr_b0); + + acc = vbfmmlaq_f32(acc, mb0, ma0); + + ptr_a0 += 8; + ptr_b0 += 8; + } + + UPDATE_C2(ptr_c0, tmp4, vget_low_f32(acc)); + UPDATE_C2(ptr_c1, tmp5, vget_high_f32(acc)); + + ptr_c0 += 2; + ptr_c1 += 2; + } + + // Final row is always a contiguous single row + if (m & 1) { + ptr_a0 = ptr_a; + ptr_b0 = ptr_b; + float32x4_t acc = {0,0,0,0}; + + for (BLASLONG p = 0; p < pad_k; p += 4) { + /// Same A value can be used for both B values + bfloat16x8_t ma0 = vreinterpretq_bf16_u64(vdupq_n_u64( + *((uint64_t*)ptr_a0) + )); + bfloat16x8_t mb0 = vld1q_bf16(ptr_b0); + + acc = vbfmmlaq_f32(acc, mb0, ma0); + + ptr_a0 += 4; + ptr_b0 += 8; + } + + UPDATE_C1(ptr_c0, acc[0]); + UPDATE_C1(ptr_c1, acc[2]); + } + + ptr_b += 2 * pad_k; + } + + if (n & 1) { + ptr_a = (bfloat16_t *)AA; + ptr_c0 = ptr_c; + + int m_step = 0; + for (; m_step < m_acc; m_step += num_accumulators) { + ptr_a0 = ptr_a; + ptr_b0 = ptr_b; + ptr_a += num_accumulators * pad_k; + + svfloat32_t acc0 = svdup_f32(0); + svfloat32_t acc1 = svdup_f32(0); + + // Load entire 8x4 block + for (BLASLONG p = 0; p < pad_k; p += 4) { + uint64_t* ptr_b0_u64 = (uint64_t*)ptr_b0; + + svbfloat16_t ma0 = svld1_bf16(pgtrue, ptr_a0); + svbfloat16_t ma1 = svld1_bf16(pgtrue, ptr_a0 + sve_size_bf16); + svbfloat16_t mb0 = svreinterpret_bf16_u64(svdup_u64(*ptr_b0_u64)); + + acc0 = svbfmmla_f32(acc0, mb0, ma0); + acc1 = svbfmmla_f32(acc1, mb0, ma1); + + ptr_a0 += sve_size_bf16 * 2; + ptr_b0 += 4; + } + + svfloat32_t out0 = svreinterpret_f32_u64(svuzp1_u64(svreinterpret_u64_f32(acc0), svreinterpret_u64_f32(acc1))); + UPDATE_C(pgtrue, ptr_c0, tmp0, out0); + + ptr_c0 += num_accumulators; + } + + for (; m_step < m2; m_step += 2) { + float32x4_t acc = {0, 0, 0, 0}; + + ptr_a0 = ptr_a; + ptr_b0 = ptr_b; + ptr_a += 2 * pad_k; + + for (BLASLONG p = 0; p < pad_k; p += 4) { + bfloat16x8_t ma0 = vld1q_bf16(ptr_a0); + bfloat16x8_t mb0 = vcombine_bf16(vld1_bf16(ptr_b0), vdup_n_bf16(vcvth_bf16_f32(0.0f))); + + acc = vbfmmlaq_f32(acc, mb0, ma0); + + ptr_a0 += 8; + ptr_b0 += 4; + } + + UPDATE_C2(ptr_c0, tmp4, vget_low_f32(acc)); + + ptr_c0 += 2; + } + + if (m & 1) { + ptr_a0 = ptr_a; + ptr_b0 = ptr_b; + float32x2_t acc = {0,0}; + + for (BLASLONG p = 0; p < pad_k; p += 4) { + bfloat16x4_t ma0 = vld1_bf16(ptr_a0); + bfloat16x4_t mb0 = vld1_bf16(ptr_b0); + + acc = vbfdot_f32(acc, ma0, mb0); + + ptr_a0 += 4; + ptr_b0 += 4; + } + + UPDATE_C1(ptr_c0, acc[0] + acc[1]); + } + } + + return 0; +} diff --git a/kernel/arm64/bgemm_ncopy_2vl_neoversev1.c b/kernel/arm64/bgemm_ncopy_2vl_neoversev1.c new file mode 100644 index 0000000000..e67d45c8a9 --- /dev/null +++ b/kernel/arm64/bgemm_ncopy_2vl_neoversev1.c @@ -0,0 +1,135 @@ +/*************************************************************************** + * Copyright (c) 2025, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#include +#include + +#include "common.h" + +int CNAME(BLASLONG n, BLASLONG m, IFLOAT *input, BLASLONG lda, IFLOAT *output) { + const int sve_size_bf16 = svcnth(); + const int num_accumulators = sve_size_bf16 >> 1; + const int m_sve_accumulators = m & -num_accumulators; + + const int n4 = n & -4; + const int n_rest = n - n4; + + const int m2 = m & -2; + const int m_rest = m - m2; + + size_t m_step = 0; + + for (; m_step < m_sve_accumulators; m_step += num_accumulators) { + const uint16_t* inner_input = input; + + // Potential for vld1q here with transpose + for (int n_step = 0; n_step < n4; n_step += 4) { + for (int line = 0; line < num_accumulators; line += 4) { + uint16x4_t a_vec0 = vld1_u16(inner_input + line * lda); + uint16x4_t a_vec1 = vld1_u16(inner_input + (line + 1) * lda); + uint16x4_t a_vec2 = vld1_u16(inner_input + (line + 2) * lda); + uint16x4_t a_vec3 = vld1_u16(inner_input + (line + 3) * lda); + + vst1_u16(output, a_vec0); + vst1_u16(output + 4, a_vec1); + vst1_u16(output + 8, a_vec2); + vst1_u16(output + 12, a_vec3); + + output += 16; + } + + inner_input += 4; + } + + // Bit of padding up to 4 for any remaining K + // by the time we get here we hope the memory bandwidth is saturated + if (n_rest) { + for (BLASLONG line = 0; line < num_accumulators; line++) { + output[0] = inner_input[0]; + output[1] = n_rest == 1 ? 0 : inner_input[1]; + output[2] = n_rest <= 2 ? 0 : inner_input[2]; + output[3] = n_rest <= 3 ? 0 : inner_input[3]; + + inner_input += lda; + output += 4; + } + } + + input += lda * num_accumulators; + } + + // Any remaining blocks are done 2 at a time for ASIMD processing + for (; m_step < m2; m_step += 2) { + const uint16_t* inner_input = input; + for (size_t n_step = 0; n_step < n4; n_step += 4) { + uint16x4_t a_vec0 = vld1_u16(inner_input); + uint16x4_t a_vec1 = vld1_u16(inner_input + lda); + + vst1_u16(output, a_vec0); + vst1_u16(output + 4, a_vec1); + + inner_input += 4; + output += 8; + } + + if (n_rest) { + for (BLASLONG line = 0; line < 2; line++) { + output[0] = inner_input[0]; + output[1] = n_rest == 1 ? 0 : inner_input[1]; + output[2] = n_rest <= 2 ? 0 : inner_input[2]; + output[3] = n_rest <= 3 ? 0 : inner_input[3]; + + inner_input += lda; + output += 4; + } + } + + input += lda * 2; + } + + // Final row is just there + if (m_rest & 1) { + for (size_t n_step = 0; n_step < n4; n_step += 4) { + uint16x4_t a_vec0 = vld1_u16(input); + + vst1_u16(output, a_vec0); + + input += 4; + output += 4; + } + + if (n_rest) { + output[0] = input[0]; + output[1] = n_rest == 1 ? 0 : input[1]; + output[2] = n_rest <= 2 ? 0 : input[2]; + output[3] = n_rest <= 3 ? 0 : input[3]; + } + } + + return 0; +} diff --git a/kernel/arm64/bgemm_ncopy_4_neoversev1.c b/kernel/arm64/bgemm_ncopy_4_neoversev1.c new file mode 100644 index 0000000000..fc68d35d92 --- /dev/null +++ b/kernel/arm64/bgemm_ncopy_4_neoversev1.c @@ -0,0 +1,124 @@ +/*************************************************************************** + * Copyright (c) 2025, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#include +#include + +#include "common.h" + +int CNAME(BLASLONG n, BLASLONG m, IFLOAT *input, BLASLONG lda, IFLOAT *output) { + const int num_accumulators = 4; + const int m_accumulators = m & -4; + + const int n4 = n & -4; + const int n_rest = n - n4; + + const int m_rest = m - m_accumulators; + + for (size_t m_step = 0; m_step < m_accumulators; m_step += num_accumulators) { + const uint16_t* inner_input = input; + + // Potential for vld1q here with transpose + for (size_t n_step = 0; n_step < n4; n_step += 4) { + uint16x4_t a_vec0 = vld1_u16(inner_input + 0 * lda); + uint16x4_t a_vec1 = vld1_u16(inner_input + 1 * lda); + uint16x4_t a_vec2 = vld1_u16(inner_input + 2 * lda); + uint16x4_t a_vec3 = vld1_u16(inner_input + 3 * lda); + + vst1_u16(output, a_vec0); + vst1_u16(output + 4, a_vec1); + vst1_u16(output + 8, a_vec2); + vst1_u16(output + 12, a_vec3); + + output += 16; + inner_input += 4; + } + + if (n_rest) { + for (BLASLONG line = 0; line < num_accumulators; line++) { + output[0] = inner_input[0]; + output[1] = n_rest == 1 ? 0 : inner_input[1]; + output[2] = n_rest <= 2 ? 0 : inner_input[2]; + output[3] = n_rest <= 3 ? 0 : inner_input[3]; + + inner_input += lda; + output += 4; + } + } + + input += lda * num_accumulators; + } + + if (m_rest & 2) { + const uint16_t* inner_input = input; + for (size_t n_step = 0; n_step < n4; n_step += 4) { + uint16x4_t a_vec0 = vld1_u16(inner_input); + uint16x4_t a_vec1 = vld1_u16(inner_input + lda); + + vst1_u16(output, a_vec0); + vst1_u16(output + 4, a_vec1); + + inner_input += 4; + output += 8; + } + + if (n_rest) { + for (BLASLONG line = 0; line < 2; line++) { + output[0] = inner_input[0]; + output[1] = n_rest == 1 ? 0 : inner_input[1]; + output[2] = n_rest <= 2 ? 0 : inner_input[2]; + output[3] = n_rest <= 3 ? 0 : inner_input[3]; + + inner_input += lda; + output += 4; + } + } + + input += lda * 2; + } + + if (m_rest & 1) { + for (size_t n_step = 0; n_step < n4; n_step += 4) { + uint16x4_t a_vec0 = vld1_u16(input); + + vst1_u16(output, a_vec0); + + input += 4; + output += 4; + } + + if (n_rest) { + output[0] = input[0]; + output[1] = n_rest == 1 ? 0 : input[1]; + output[2] = n_rest <= 2 ? 0 : input[2]; + output[3] = n_rest <= 3 ? 0 : input[3]; + } + } + + return 0; +} diff --git a/kernel/arm64/bgemm_tcopy_2vl_neoversev1.c b/kernel/arm64/bgemm_tcopy_2vl_neoversev1.c new file mode 100644 index 0000000000..deb2e9c492 --- /dev/null +++ b/kernel/arm64/bgemm_tcopy_2vl_neoversev1.c @@ -0,0 +1,154 @@ +/*************************************************************************** + * Copyright (c) 2025, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#include +#include + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *input, BLASLONG lda, IFLOAT *output) { + const int sve_size_bf16 = svcnth(); + const int num_accumulators_sve = sve_size_bf16 >> 1; + const int num_accumulators = num_accumulators_sve; + const int incr_accumulators = 4; + + const int n_sve_accumulators = (n & -num_accumulators); + const int n2 = n & -2; + const int n_rest = n - n2; + + const int m4 = m & -4; + const int m_rest = m - m4; + + size_t n_step = 0; + + for (; n_step < n_sve_accumulators; n_step += num_accumulators) { + const uint16_t* inner_input = input; + + // Full 4x4 item transposes down the M dimension + for (size_t m_step = 0; m_step < m4; m_step += 4) { + const uint16_t* tile = inner_input; + + for (size_t line = 0; line < num_accumulators; line += incr_accumulators) { + // Load 4x4 block + uint16x4_t a_vec0 = vld1_u16(tile); + uint16x4_t a_vec1 = vld1_u16(tile + lda); + uint16x4_t a_vec2 = vld1_u16(tile + 2 * lda); + uint16x4_t a_vec3 = vld1_u16(tile + 3 * lda); + + // Transpose 4x4 blocks + uint16x4_t out_vec0 = vzip1_u16(a_vec0, a_vec1); + uint16x4_t out_vec1 = vzip2_u16(a_vec0, a_vec1); + uint16x4_t out_vec2 = vzip1_u16(a_vec2, a_vec3); + uint16x4_t out_vec3 = vzip2_u16(a_vec2, a_vec3); + + // Transpose 8x4 blocks + a_vec0 = vreinterpret_u16_u32(vzip1_u32(vreinterpret_u32_u16(out_vec0), vreinterpret_u32_u16(out_vec2))); + a_vec1 = vreinterpret_u16_u32(vzip2_u32(vreinterpret_u32_u16(out_vec0), vreinterpret_u32_u16(out_vec2))); + a_vec2 = vreinterpret_u16_u32(vzip1_u32(vreinterpret_u32_u16(out_vec1), vreinterpret_u32_u16(out_vec3))); + a_vec3 = vreinterpret_u16_u32(vzip2_u32(vreinterpret_u32_u16(out_vec1), vreinterpret_u32_u16(out_vec3))); + + vst1_u16(output, a_vec0); + vst1_u16(output + 4, a_vec1); + vst1_u16(output + 8, a_vec2); + vst1_u16(output + 12, a_vec3); + + tile += incr_accumulators; + output += 16; + } + + inner_input += incr_accumulators * lda; + } + + if (m_rest) { + for (BLASLONG line = 0; line < num_accumulators; line++) { + output[0] = inner_input[0]; + output[1] = m_rest == 1 ? 0 : *(inner_input + lda); + output[2] = m_rest <= 2 ? 0 : *(inner_input + 2 * lda); + output[3] = m_rest <= 3 ? 0 : *(inner_input + 3 * lda); + + inner_input++; + output += 4; + } + } + + input += num_accumulators; + } + + for (; n_step < n2; n_step += 2) { + const uint16_t* inner_input = input; + for (size_t m_step = 0; m_step < m4; m_step += 4) { + for (BLASLONG line = 0; line < 2; line++) { + output[0] = *(inner_input + line); + output[1] = *(inner_input + line + lda); + output[2] = *(inner_input + line + 2 * lda); + output[3] = *(inner_input + line + 3 * lda); + + output += 4; + } + + inner_input += 4 * lda; + } + + if (m_rest) { + for (BLASLONG line = 0; line < 2; line++) { + output[0] = *(inner_input + line); + output[1] = m_rest == 1 ? 0 : *(inner_input + line + lda); + output[2] = m_rest <= 2 ? 0 : *(inner_input + line + 2 * lda); + output[3] = m_rest <= 3 ? 0 : *(inner_input + line + 3 * lda); + + output += 4; + } + } + + input += 2; + } + + if (n_rest & 1) { + const uint16_t* inner_input = input; + for (size_t m_step = 0; m_step < m4; m_step += 4) { + output[0] = *inner_input; + output[1] = *(inner_input + lda); + output[2] = *(inner_input + 2 * lda); + output[3] = *(inner_input + 3 * lda); + + inner_input += 4 * lda; + output += 4; + } + + if (m_rest) { + output[0] = inner_input[0]; + output[1] = m_rest == 1 ? 0 : *(inner_input + lda); + output[2] = m_rest <= 2 ? 0 : *(inner_input + 2 * lda); + output[3] = m_rest <= 3 ? 0 : *(inner_input + 3 * lda); + + output += 4; + } + } + + return 0; +} diff --git a/kernel/arm64/bgemm_tcopy_4_neoversev1.c b/kernel/arm64/bgemm_tcopy_4_neoversev1.c new file mode 100644 index 0000000000..49a6eca01a --- /dev/null +++ b/kernel/arm64/bgemm_tcopy_4_neoversev1.c @@ -0,0 +1,143 @@ +/*************************************************************************** + * Copyright (c) 2025, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#include +#include + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *input, BLASLONG lda, IFLOAT *output) { + const int num_accumulators = 4; + const int n_accumulators = (n & -num_accumulators); + const int n_rest = n - n_accumulators; + + const int m4 = m & -4; + const int m_rest = m - m4; + + for (size_t n_step = 0; n_step < n_accumulators; n_step += num_accumulators) { + const uint16_t* inner_input = input; + + // Full 4x4 item transposes down the M dimension + for (size_t m_step = 0; m_step < m4; m_step += 4) { + // Load 4x4 block + uint16x4_t a_vec0 = vld1_u16(inner_input); + uint16x4_t a_vec1 = vld1_u16(inner_input + lda); + uint16x4_t a_vec2 = vld1_u16(inner_input + 2 * lda); + uint16x4_t a_vec3 = vld1_u16(inner_input + 3 * lda); + + // Transpose 4x4 blocks + uint16x4_t out_vec0 = vzip1_u16(a_vec0, a_vec1); + uint16x4_t out_vec1 = vzip2_u16(a_vec0, a_vec1); + uint16x4_t out_vec2 = vzip1_u16(a_vec2, a_vec3); + uint16x4_t out_vec3 = vzip2_u16(a_vec2, a_vec3); + + // Transpose 8x4 blocks + a_vec0 = vreinterpret_u16_u32(vzip1_u32(vreinterpret_u32_u16(out_vec0), vreinterpret_u32_u16(out_vec2))); + a_vec1 = vreinterpret_u16_u32(vzip2_u32(vreinterpret_u32_u16(out_vec0), vreinterpret_u32_u16(out_vec2))); + a_vec2 = vreinterpret_u16_u32(vzip1_u32(vreinterpret_u32_u16(out_vec1), vreinterpret_u32_u16(out_vec3))); + a_vec3 = vreinterpret_u16_u32(vzip2_u32(vreinterpret_u32_u16(out_vec1), vreinterpret_u32_u16(out_vec3))); + + vst1_u16(output, a_vec0); + vst1_u16(output + 4, a_vec1); + vst1_u16(output + 8, a_vec2); + vst1_u16(output + 12, a_vec3); + + inner_input += 4 * lda; + output += 16; + } + + if (m_rest) { + for (BLASLONG line = 0; line < num_accumulators; line++) { + output[0] = inner_input[0]; + output[1] = m_rest == 1 ? 0 : *(inner_input + lda); + output[2] = m_rest <= 2 ? 0 : *(inner_input + 2 * lda); + output[3] = m_rest <= 3 ? 0 : *(inner_input + 3 * lda); + + inner_input++; + output += 4; + } + } + + input += num_accumulators; + } + + // Extract two remaining rows as 128-bit vector paired + if (n_rest & 2) { + const uint16_t* inner_input = input; + for (size_t m_step = 0; m_step < m4; m_step += 4) { + for (BLASLONG line = 0; line < 2; line++) { + output[0] = *(inner_input + line); + output[1] = *(inner_input + line + lda); + output[2] = *(inner_input + line + 2 * lda); + output[3] = *(inner_input + line + 3 * lda); + + output += 4; + } + + inner_input += 4 * lda; + } + + if (m_rest) { + for (BLASLONG line = 0; line < 2; line++) { + output[0] = *(inner_input + line); + output[1] = m_rest == 1 ? 0 : *(inner_input + line + lda); + output[2] = m_rest <= 2 ? 0 : *(inner_input + line + 2 * lda); + output[3] = m_rest <= 3 ? 0 : *(inner_input + line + 3 * lda); + + output += 4; + } + } + + input += 2; + } + + // Flatten final row + if (n_rest & 1) { + const uint16_t* inner_input = input; + for (size_t m_step = 0; m_step < m4; m_step += 4) { + output[0] = *inner_input; + output[1] = *(inner_input + lda); + output[2] = *(inner_input + 2 * lda); + output[3] = *(inner_input + 3 * lda); + + inner_input += 4 * lda; + output += 4; + } + + if (m_rest) { + output[0] = inner_input[0]; + output[1] = m_rest == 1 ? 0 : *(inner_input + lda); + output[2] = m_rest <= 2 ? 0 : *(inner_input + 2 * lda); + output[3] = m_rest <= 3 ? 0 : *(inner_input + 3 * lda); + + output += 4; + } + } + + return 0; +} diff --git a/kernel/arm64/bgemv_n_sve_v3x4.c b/kernel/arm64/bgemv_n_sve_v3x4.c new file mode 100644 index 0000000000..b5e5f76a23 --- /dev/null +++ b/kernel/arm64/bgemv_n_sve_v3x4.c @@ -0,0 +1,322 @@ +/*************************************************************************** +Copyright (c) 2025 The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#include +#include + +#define UPDATE_PTRSx2 \ + a_ptr1 = a_ptr0 + lda; + +#define UPDATE_PTRSx4 \ + UPDATE_PTRSx2 \ + a_ptr2 = a_ptr1 + lda; \ + a_ptr3 = a_ptr2 + lda; + +#define UPDATE_PTRSx8 \ + UPDATE_PTRSx4 \ + a_ptr4 = a_ptr3 + lda; \ + a_ptr5 = a_ptr4 + lda; \ + a_ptr6 = a_ptr5 + lda; \ + a_ptr7 = a_ptr6 + lda; + +#define LANESx2(MACRO, offset) \ + MACRO(offset, 0) \ + MACRO(offset, 1) + +#define LANESx4(MACRO, offset) \ + LANESx2(MACRO, offset) \ + MACRO(offset, 2) \ + MACRO(offset, 3) + +#define LANESx8(MACRO, offset) \ + LANESx4(MACRO, offset) \ + MACRO(offset, 4) \ + MACRO(offset, 5) \ + MACRO(offset, 6) \ + MACRO(offset, 7) + +#define LOAD_A_VEC(offset, vec) \ + svbfloat16_t a_vec ## offset ## vec = svld1(pg_full, &a_ptr ## vec[i] + offset * sve_size_bf16); + +#define UPDATE_ACCUMULATORS_FROM_LANE(offset, lane) \ + acc ## offset ## 0 = svbfmlalb_lane(acc ## offset ## 0, a_vec ## offset ## lane, x_vec, lane); \ + acc ## offset ## 1 = svbfmlalt_lane(acc ## offset ## 1, a_vec ## offset ## lane, x_vec, lane); + +#define INIT_ACCUMULATORS(offset) \ + svfloat32_t acc ## offset ## 0 = svdup_f32(0.0); \ + svfloat32_t acc ## offset ## 1 = svdup_f32(0.0); + +#define UPDATE_ACCUMULATORS(offset) \ + acc ## offset ## 0 = svbfmlalb(acc ## offset ## 0, a_vec ## offset ## 0, x_vec); \ + acc ## offset ## 1 = svbfmlalt(acc ## offset ## 1, a_vec ## offset ## 0, x_vec); + +#define STORE_ACCUMULATORS(offset) \ + svbfloat16_t acc ## offset ## 0_bf16 = svcvt_bf16_x(pg_full, acc ## offset ## 0); \ + svbfloat16_t acc ## offset ## 1_bf16 = svcvt_bf16_x(pg_full, acc ## offset ## 1); \ + svbfloat16_t combined ## offset = svtrn1(acc ## offset ## 0_bf16, acc ## offset ## 1_bf16); \ + svst1(pg_full, &y[i] + offset * sve_size_bf16, combined ## offset); + +#define ALPHA_OP(offset) \ + acc ## offset ## 0 = svmul_x(pg_full, acc ## offset ## 0, svalpha); \ + acc ## offset ## 1 = svmul_x(pg_full, acc ## offset ## 1, svalpha); + +#define BETA_OP(offset) \ + svbfloat16_t y_vec ## offset = svld1(pg_full, &y[i] + offset * sve_size_bf16); \ + acc ## offset ## 0 = svbfmlalb(acc ## offset ## 0, svbeta16, y_vec ## offset); \ + acc ## offset ## 1 = svbfmlalt(acc ## offset ## 1, svbeta16, y_vec ## offset); + +int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, IFLOAT *a_in, BLASLONG lda, IFLOAT *x_in, BLASLONG inc_x, FLOAT beta, FLOAT *y_in, BLASLONG inc_y) +{ + BLASLONG ix; + bfloat16_t *a_ptr0, *a_ptr1, *a_ptr2, *a_ptr3, *a_ptr4, *a_ptr5, *a_ptr6, *a_ptr7; + BLASLONG sve_size_bf16 = svcnth(); + BLASLONG sve_size2_bf16 = sve_size_bf16 * 2; + BLASLONG sve_size3_bf16 = sve_size_bf16 * 3; + svbool_t pg_full = svptrue_b16(); + svbool_t pg_tail = svwhilelt_b16_s64(0, m % sve_size_bf16); + + BLASLONG n8 = n & -8; + BLASLONG n4 = n & -4; + BLASLONG n2 = n & -2; + + bfloat16_t *a = (bfloat16_t*)a_in; + bfloat16_t *x = (bfloat16_t*)x_in; + bfloat16_t *y = (bfloat16_t*)y_in; + + bfloat16_t alpha_bf16, beta_bf16; + memcpy(&alpha_bf16, &alpha, sizeof(bfloat16_t)); + memcpy(&beta_bf16, &beta, sizeof(bfloat16_t)); + float beta_f32 = vcvtah_f32_bf16(beta_bf16); + float alpha_f32 = vcvtah_f32_bf16(alpha_bf16); + svfloat32_t svalpha = svdup_f32(vcvtah_f32_bf16(alpha_bf16)); + svbfloat16_t svbeta16 = svdup_bf16(beta_bf16); + + BLASLONG i = 0; + if (inc_y == 1) { + for (; (i + sve_size3_bf16 - 1) < m; i+= sve_size3_bf16) { + INIT_ACCUMULATORS(0); + INIT_ACCUMULATORS(1); + INIT_ACCUMULATORS(2); + + BLASLONG j = 0; + ix = 0; + a_ptr0 = a; + UPDATE_PTRSx8; + + if (inc_x == 1) { + for (; j < n4; j+= 4) { + uint64_t* x_u64 = (uint64_t*)&x[ix]; + svbfloat16_t x_vec = svreinterpret_bf16_u64(svdup_u64(*x_u64)); + + LANESx4(LOAD_A_VEC, 0); + LANESx4(LOAD_A_VEC, 1); + LANESx4(LOAD_A_VEC, 2); + LANESx4(UPDATE_ACCUMULATORS_FROM_LANE, 0); + LANESx4(UPDATE_ACCUMULATORS_FROM_LANE, 1); + LANESx4(UPDATE_ACCUMULATORS_FROM_LANE, 2); + + ix += 4; + + a_ptr0 += 4 * lda; + UPDATE_PTRSx4; + } + + for (; j < n2; j+= 2) { + uint32_t* x_u32 = (uint32_t*)&x[ix]; + svbfloat16_t x_vec = svreinterpret_bf16_u32(svdup_u32(*x_u32)); + + LANESx2(LOAD_A_VEC, 0); + LANESx2(LOAD_A_VEC, 1); + LANESx2(LOAD_A_VEC, 2); + LANESx2(UPDATE_ACCUMULATORS_FROM_LANE, 0); + LANESx2(UPDATE_ACCUMULATORS_FROM_LANE, 1); + LANESx2(UPDATE_ACCUMULATORS_FROM_LANE, 2); + + ix += 2; + + a_ptr0 += 2 * lda; + UPDATE_PTRSx2; + } + } + + for (; j < n; j++) { + svbfloat16_t x_vec = svdup_bf16(x[ix]); + LOAD_A_VEC(0, 0); + LOAD_A_VEC(1, 0); + LOAD_A_VEC(2, 0); + UPDATE_ACCUMULATORS(0); + UPDATE_ACCUMULATORS(1); + UPDATE_ACCUMULATORS(2); + + ix += inc_x; + a_ptr0 += lda; + } + + if (alpha_f32 != ONE) { + ALPHA_OP(0); + ALPHA_OP(1); + ALPHA_OP(2); + } + if (beta_f32 != ZERO) { + BETA_OP(0); + BETA_OP(1); + BETA_OP(2); + } + + STORE_ACCUMULATORS(0); + STORE_ACCUMULATORS(1); + STORE_ACCUMULATORS(2); + } + + for (; (i + sve_size_bf16 - 1) < m; i+= sve_size_bf16) { + INIT_ACCUMULATORS(0); + + BLASLONG j = 0; + ix = 0; + a_ptr0 = a; + UPDATE_PTRSx8; + + if (inc_x == 1) { + for (; j < n8; j+= 8) { + svbfloat16_t x_vec = svld1rq(pg_full, &x[ix]); + LANESx8(LOAD_A_VEC, 0); + LANESx8(UPDATE_ACCUMULATORS_FROM_LANE, 0); + + ix += 8; + + a_ptr0 += 8 * lda; + UPDATE_PTRSx8; + } + + for (; j < n4; j+= 4) { + uint64_t* x_u64 = (uint64_t*)&x[ix]; + svbfloat16_t x_vec = svreinterpret_bf16_u64(svdup_u64(*x_u64)); + + LANESx4(LOAD_A_VEC, 0); + LANESx4(UPDATE_ACCUMULATORS_FROM_LANE, 0); + + ix += 4; + + a_ptr0 += 4 * lda; + UPDATE_PTRSx4; + } + + for (; j < n2; j+= 2) { + uint32_t* x_u32 = (uint32_t*)&x[ix]; + svbfloat16_t x_vec = svreinterpret_bf16_u32(svdup_u32(*x_u32)); + + LANESx2(LOAD_A_VEC, 0); + LANESx2(UPDATE_ACCUMULATORS_FROM_LANE, 0); + + ix += 2; + + a_ptr0 += 2 * lda; + UPDATE_PTRSx2; + } + } + + for (; j < n; j++) { + svbfloat16_t x_vec = svdup_bf16(x[ix]); + LOAD_A_VEC(0, 0); + UPDATE_ACCUMULATORS(0); + + ix += inc_x; + a_ptr0 += lda; + } + + if (alpha_f32 != ONE) { + ALPHA_OP(0); + } + if (beta_f32 != ZERO) { + BETA_OP(0); + } + + STORE_ACCUMULATORS(0); + } + + if (i < m) { + svfloat32_t acc0 = svdup_f32(0.0); + svfloat32_t acc1 = svdup_f32(0.0); + + ix = 0; + a_ptr0 = a; + for (BLASLONG j = 0; j < n; j++) { + svbfloat16_t x_vec0 = svdup_bf16(x[ix]); + svbfloat16_t a_vec0 = svld1(pg_tail, &a_ptr0[i]); + + acc0 = svbfmlalb(acc0, a_vec0, x_vec0); + acc1 = svbfmlalt(acc1, a_vec0, x_vec0); + + ix += inc_x; + a_ptr0 += lda; + } + + if (alpha_f32 != ONE) { + acc0 = svmul_x(pg_full, acc0, svalpha); + acc1 = svmul_x(pg_full, acc1, svalpha); + } + if (beta_f32 != ZERO) { + svbfloat16_t y_vec = svld1(pg_tail, &y[i]); + acc0 = svbfmlalb(acc0, svbeta16, y_vec); + acc1 = svbfmlalt(acc1, svbeta16, y_vec); + } + + svbfloat16_t acc0_bf16 = svcvt_bf16_x(pg_full, acc0); + svbfloat16_t acc1_bf16 = svcvt_bf16_x(pg_full, acc1); + svbfloat16_t combined = svtrn1(acc0_bf16, acc1_bf16); + svst1(pg_tail, &y[i], combined); + } + + return 0; + } + + // Scalar fallback + BLASLONG iy = 0; + for (; i < m; i++) { + float temp = 0.0; + + ix = 0; + a_ptr0 = a; + for (BLASLONG j = 0; j < n; j++) { + temp += vcvtah_f32_bf16(a_ptr0[i]) * vcvtah_f32_bf16(x[ix]); + ix += inc_x; + a_ptr0 += lda; + } + + if (beta_f32 == ZERO) { + y[iy] = vcvth_bf16_f32(alpha_f32 * temp); + } else { + y[iy] = vcvth_bf16_f32(alpha_f32 * temp + beta_f32 * vcvtah_f32_bf16(y[iy])); + } + + iy += inc_y; + } + + return (0); +} diff --git a/kernel/arm64/dot.c b/kernel/arm64/dot.c index ece31bccdf..1e49ffdd60 100644 --- a/kernel/arm64/dot.c +++ b/kernel/arm64/dot.c @@ -40,8 +40,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef USE_SVE +#ifdef DOT_KERNEL_SVE +#include DOT_KERNEL_SVE +#elif defined(A64FX) +#include "dot_kernel_sve_v8.c" +#else #include "dot_kernel_sve.c" #endif +#endif #include "dot_kernel_asimd.c" #if defined(SMP) @@ -78,14 +84,43 @@ static inline int get_dot_optimal_nthreads_neoversev1(BLASLONG N, int ncpu) { } #endif +#if defined(DYNAMIC_ARCH) || defined(A64FX) +static inline int get_dot_optimal_nthreads_a64fx(BLASLONG N, int ncpu) { + #ifdef DOUBLE + return (N <= 11000L) ? 1 + : (N <= 20000L) ? MIN(ncpu, 2) + : (N <= 35000L) ? MIN(ncpu, 4) + : (N <= 50000L) ? MIN(ncpu, 6) + : (N <= 440000L) ? MIN(ncpu, 8) + : (N <= 880000L) ? MIN(ncpu, 16) + : (N <= 1020000L) ? MIN(ncpu, 24) + : ncpu; + #else + return (N <= 22000L) ? 1 + : (N <= 39000L) ? MIN(ncpu, 2) + : (N <= 79000L) ? MIN(ncpu, 4) + : (N <= 120000L) ? MIN(ncpu, 6) + : (N <= 1020000L) ? MIN(ncpu, 8) + : ncpu; + #endif +} +#endif + static inline int get_dot_optimal_nthreads(BLASLONG n) { int ncpu = num_cpu_avail(1); -#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(BFLOAT16) +#if defined(A64FX) && !defined(COMPLEX) && !defined(BFLOAT16) + return get_dot_optimal_nthreads_a64fx(n, ncpu); +#elif defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(BFLOAT16) return get_dot_optimal_nthreads_neoversev1(n, ncpu); #elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(BFLOAT16) - if (strcmp(gotoblas_corename(), "neoversev1") == 0) { - return get_dot_optimal_nthreads_neoversev1(n, ncpu); + { + const char *core = gotoblas_corename(); + if (strcmp(core, "a64fx") == 0) { + return get_dot_optimal_nthreads_a64fx(n, ncpu); + } else if (strcmp(core, "neoversev1") == 0) { + return get_dot_optimal_nthreads_neoversev1(n, ncpu); + } } #endif @@ -93,7 +128,7 @@ static inline int get_dot_optimal_nthreads(BLASLONG n) { if (n <= 10000L) return 1; else - return num_cpu_avail(1); + return ncpu; } #endif diff --git a/kernel/arm64/dot_kernel_sve.c b/kernel/arm64/dot_kernel_sve.c index bc9975214b..96485184c1 100644 --- a/kernel/arm64/dot_kernel_sve.c +++ b/kernel/arm64/dot_kernel_sve.c @@ -48,7 +48,7 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. " add x12, %[X_], x9, lsl #"SHIFT" \n" \ " add x13, %[Y_], x9, lsl #"SHIFT" \n" #define TAIL_WHILE \ -" whilelo p1."DTYPE", x8, x0 \n" +" whilelo p1."DTYPE", x8, %[N_] \n" #define UPDATE(pg, x,y,out) \ " ld1"WIDTH" { z2."DTYPE" }, "pg"/z, ["x", x8, lsl #"SHIFT"] \n" \ " ld1"WIDTH" { z3."DTYPE" }, "pg"/z, ["y", x8, lsl #"SHIFT"] \n" \ @@ -66,7 +66,7 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. " movi d1, #0x0 \n" \ SETUP_TRUE \ " neg x10, x9, lsl #1 \n" \ -" ands x11, x10, x0 \n" \ +" ands x11, x10, %[N_] \n" \ " b.eq 2f // skip_2x \n" \ OFFSET_INPUTS \ "1: // vector_2x \n" \ @@ -78,7 +78,7 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SUM_VECTOR("1") \ "2: // skip_2x \n" \ " neg x10, x9 \n" \ -" and x10, x10, x0 \n" \ +" and x10, x10, %[N_] \n" \ " cmp x8, x10 \n" \ " b.hs 4f // tail \n" \ "3: // vector_1x \n" \ @@ -87,7 +87,7 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. " cmp x8, x10 \n" \ " b.lo 3b // vector_1x \n" \ "4: // tail \n" \ -" cmp x10, x0 \n" \ +" cmp x10, %[N_] \n" \ " b.eq 5f // end \n" \ TAIL_WHILE \ UPDATE("p1", "%[X_]", "%[Y_]", "z0") \ @@ -110,9 +110,9 @@ dot_kernel_sve(BLASLONG n, FLOAT* x, FLOAT* y) [Y_] "r" (y) : "cc", "memory", - "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", - "x8", "x9", "x10", "x11", "x12", "x13", "d1", - "z0", "z1" + "x8", "x9", "x10", "x11", "x12", "x13", + "v0", "v1", + "z0", "z1", "z2", "z3" ); return ret; diff --git a/kernel/arm64/dot_kernel_sve_v8.c b/kernel/arm64/dot_kernel_sve_v8.c new file mode 100644 index 0000000000..fa18553b11 --- /dev/null +++ b/kernel/arm64/dot_kernel_sve_v8.c @@ -0,0 +1,146 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#ifdef DOUBLE +#define SV_COUNT svcntd +#define SV_TYPE svfloat64_t +#define SV_TRUE svptrue_b64 +#define SV_WHILE svwhilelt_b64_s64 +#define SV_DUP svdup_f64 +#else +#define SV_COUNT svcntw +#define SV_TYPE svfloat32_t +#define SV_TRUE svptrue_b32 +#define SV_WHILE svwhilelt_b32_s64 +#define SV_DUP svdup_f32 +#endif + +static FLOAT dot_kernel_sve(BLASLONG n, FLOAT* x, FLOAT* y) +{ + SV_TYPE temp0_vec = SV_DUP(0.0); + SV_TYPE temp1_vec = SV_DUP(0.0); + SV_TYPE temp2_vec = SV_DUP(0.0); + SV_TYPE temp3_vec = SV_DUP(0.0); + SV_TYPE temp4_vec = SV_DUP(0.0); + SV_TYPE temp5_vec = SV_DUP(0.0); + SV_TYPE temp6_vec = SV_DUP(0.0); + SV_TYPE temp7_vec = SV_DUP(0.0); + + BLASLONG i = 0; + BLASLONG sve_size = SV_COUNT(); + + while ((i + sve_size * 8 - 1) < n) { + FLOAT *x0_ptr = x + i; + SV_TYPE x0_vec = svld1_vnum(SV_TRUE(), x0_ptr, 0); + SV_TYPE x1_vec = svld1_vnum(SV_TRUE(), x0_ptr, 1); + SV_TYPE x2_vec = svld1_vnum(SV_TRUE(), x0_ptr, 2); + SV_TYPE x3_vec = svld1_vnum(SV_TRUE(), x0_ptr, 3); + SV_TYPE x4_vec = svld1_vnum(SV_TRUE(), x0_ptr, 4); + SV_TYPE x5_vec = svld1_vnum(SV_TRUE(), x0_ptr, 5); + SV_TYPE x6_vec = svld1_vnum(SV_TRUE(), x0_ptr, 6); + SV_TYPE x7_vec = svld1_vnum(SV_TRUE(), x0_ptr, 7); + + FLOAT *y0_ptr = y + i; + SV_TYPE y0_vec = svld1_vnum(SV_TRUE(), y0_ptr, 0); + SV_TYPE y1_vec = svld1_vnum(SV_TRUE(), y0_ptr, 1); + SV_TYPE y2_vec = svld1_vnum(SV_TRUE(), y0_ptr, 2); + SV_TYPE y3_vec = svld1_vnum(SV_TRUE(), y0_ptr, 3); + SV_TYPE y4_vec = svld1_vnum(SV_TRUE(), y0_ptr, 4); + SV_TYPE y5_vec = svld1_vnum(SV_TRUE(), y0_ptr, 5); + SV_TYPE y6_vec = svld1_vnum(SV_TRUE(), y0_ptr, 6); + SV_TYPE y7_vec = svld1_vnum(SV_TRUE(), y0_ptr, 7); + + temp0_vec = svmla_x(SV_TRUE(), temp0_vec, x0_vec, y0_vec); + temp1_vec = svmla_x(SV_TRUE(), temp1_vec, x1_vec, y1_vec); + temp2_vec = svmla_x(SV_TRUE(), temp2_vec, x2_vec, y2_vec); + temp3_vec = svmla_x(SV_TRUE(), temp3_vec, x3_vec, y3_vec); + temp4_vec = svmla_x(SV_TRUE(), temp4_vec, x4_vec, y4_vec); + temp5_vec = svmla_x(SV_TRUE(), temp5_vec, x5_vec, y5_vec); + temp6_vec = svmla_x(SV_TRUE(), temp6_vec, x6_vec, y6_vec); + temp7_vec = svmla_x(SV_TRUE(), temp7_vec, x7_vec, y7_vec); + + i += sve_size * 8; + } + + if (i < n) { + svbool_t pg0 = SV_WHILE(i + sve_size * 0, n); + svbool_t pg1 = SV_WHILE(i + sve_size * 1, n); + svbool_t pg2 = SV_WHILE(i + sve_size * 2, n); + svbool_t pg3 = SV_WHILE(i + sve_size * 3, n); + svbool_t pg4 = SV_WHILE(i + sve_size * 4, n); + svbool_t pg5 = SV_WHILE(i + sve_size * 5, n); + svbool_t pg6 = SV_WHILE(i + sve_size * 6, n); + svbool_t pg7 = SV_WHILE(i + sve_size * 7, n); + + FLOAT *x0_ptr = x + i; + SV_TYPE x0_vec = svld1_vnum(pg0, x0_ptr, 0); + SV_TYPE x1_vec = svld1_vnum(pg1, x0_ptr, 1); + SV_TYPE x2_vec = svld1_vnum(pg2, x0_ptr, 2); + SV_TYPE x3_vec = svld1_vnum(pg3, x0_ptr, 3); + SV_TYPE x4_vec = svld1_vnum(pg4, x0_ptr, 4); + SV_TYPE x5_vec = svld1_vnum(pg5, x0_ptr, 5); + SV_TYPE x6_vec = svld1_vnum(pg6, x0_ptr, 6); + SV_TYPE x7_vec = svld1_vnum(pg7, x0_ptr, 7); + + FLOAT *y0_ptr = y + i; + SV_TYPE y0_vec = svld1_vnum(pg0, y0_ptr, 0); + SV_TYPE y1_vec = svld1_vnum(pg1, y0_ptr, 1); + SV_TYPE y2_vec = svld1_vnum(pg2, y0_ptr, 2); + SV_TYPE y3_vec = svld1_vnum(pg3, y0_ptr, 3); + SV_TYPE y4_vec = svld1_vnum(pg4, y0_ptr, 4); + SV_TYPE y5_vec = svld1_vnum(pg5, y0_ptr, 5); + SV_TYPE y6_vec = svld1_vnum(pg6, y0_ptr, 6); + SV_TYPE y7_vec = svld1_vnum(pg7, y0_ptr, 7); + + temp0_vec = svmla_m(pg0, temp0_vec, x0_vec, y0_vec); + temp1_vec = svmla_m(pg1, temp1_vec, x1_vec, y1_vec); + temp2_vec = svmla_m(pg2, temp2_vec, x2_vec, y2_vec); + temp3_vec = svmla_m(pg3, temp3_vec, x3_vec, y3_vec); + temp4_vec = svmla_m(pg4, temp4_vec, x4_vec, y4_vec); + temp5_vec = svmla_m(pg5, temp5_vec, x5_vec, y5_vec); + temp6_vec = svmla_m(pg6, temp6_vec, x6_vec, y6_vec); + temp7_vec = svmla_m(pg7, temp7_vec, x7_vec, y7_vec); + } + + temp0_vec = svadd_x(SV_TRUE(), temp0_vec, temp1_vec); + temp2_vec = svadd_x(SV_TRUE(), temp2_vec, temp3_vec); + temp4_vec = svadd_x(SV_TRUE(), temp4_vec, temp5_vec); + temp6_vec = svadd_x(SV_TRUE(), temp6_vec, temp7_vec); + temp0_vec = svadd_x(SV_TRUE(), temp0_vec, temp2_vec); + temp4_vec = svadd_x(SV_TRUE(), temp4_vec, temp6_vec); + temp0_vec = svadd_x(SV_TRUE(), temp0_vec, temp4_vec); + + return svaddv(SV_TRUE(), temp0_vec); +} diff --git a/kernel/arm64/gemv_n_sve.c b/kernel/arm64/gemv_n_sve.c index 59a5c85572..c2f4557396 100644 --- a/kernel/arm64/gemv_n_sve.c +++ b/kernel/arm64/gemv_n_sve.c @@ -69,13 +69,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT *a2_ptr = a + lda * width * 2; for (j = 0; j < width; j++) { - for (i = 0; (i + sve_size - 1) < m; i += sve_size) { - ix = j * inc_x; - - SV_TYPE x0_vec = SV_DUP(alpha * x[ix + (inc_x * width * 0)]); - SV_TYPE x1_vec = SV_DUP(alpha * x[ix + (inc_x * width * 1)]); - SV_TYPE x2_vec = SV_DUP(alpha * x[ix + (inc_x * width * 2)]); + ix = j * inc_x; + SV_TYPE x0_vec = SV_DUP(alpha * x[ix + (inc_x * width * 0)]); + SV_TYPE x1_vec = SV_DUP(alpha * x[ix + (inc_x * width * 1)]); + SV_TYPE x2_vec = SV_DUP(alpha * x[ix + (inc_x * width * 2)]); + for (i = 0; (i + sve_size - 1) < m; i += sve_size) { SV_TYPE a00_vec = svld1(pg_true, a0_ptr + i); SV_TYPE a01_vec = svld1(pg_true, a1_ptr + i); SV_TYPE a02_vec = svld1(pg_true, a2_ptr + i); @@ -89,10 +88,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO } if (i < m) { - SV_TYPE x0_vec = SV_DUP(alpha * x[ix + (inc_x * width * 0)]); - SV_TYPE x1_vec = SV_DUP(alpha * x[ix + (inc_x * width * 1)]); - SV_TYPE x2_vec = SV_DUP(alpha * x[ix + (inc_x * width * 2)]); - SV_TYPE a00_vec = svld1(pg, a0_ptr + i); SV_TYPE a01_vec = svld1(pg, a1_ptr + i); SV_TYPE a02_vec = svld1(pg, a2_ptr + i); @@ -115,9 +110,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO a_ptr = a2_ptr; for (j = width * 3; j < n; j++) { ix = j * inc_x; + SV_TYPE x_vec = SV_DUP(alpha * x[(ix)]); for (i = 0; (i + sve_size - 1) < m; i += sve_size) { SV_TYPE y_vec = svld1(pg_true, y + i); - SV_TYPE x_vec = SV_DUP(alpha * x[(ix)]); SV_TYPE a_vec = svld1(pg_true, a_ptr + i); y_vec = svmla_x(pg_true, y_vec, a_vec, x_vec); svst1(pg_true, y + i, y_vec); @@ -125,7 +120,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if (i < m) { SV_TYPE y_vec = svld1(pg, y + i); - SV_TYPE x_vec = SV_DUP(alpha * x[(ix)]); SV_TYPE a_vec = svld1(pg, a_ptr + i); y_vec = svmla_m(pg, y_vec, a_vec, x_vec); svst1(pg, y + i, y_vec); diff --git a/kernel/arm64/gemv_n_sve_v1x3.c b/kernel/arm64/gemv_n_sve_v1x3.c index d6aa3d3894..5ab8d3a166 100644 --- a/kernel/arm64/gemv_n_sve_v1x3.c +++ b/kernel/arm64/gemv_n_sve_v1x3.c @@ -52,17 +52,17 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i; - BLASLONG ix,iy; - BLASLONG j; - FLOAT *a_ptr; + BLASLONG i, j; + BLASLONG ix = 0; + BLASLONG iy; + FLOAT *a_ptr = a; FLOAT temp; - ix = 0; - a_ptr = a; - if (inc_y == 1) { - BLASLONG width = (n + 3 - 1) / 3; + BLASLONG width = n / 3; // Only process full 3-column blocks + BLASLONG sve_size = SV_COUNT(); + svbool_t pg_full = SV_TRUE(); + svbool_t pg_tail = SV_WHILE(0, m % sve_size); FLOAT *a0_ptr = a_ptr + lda * width * 0; FLOAT *a1_ptr = a_ptr + lda * width * 1; @@ -73,57 +73,75 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, FLOAT *x2_ptr = x + inc_x * width * 2; for (j = 0; j < width; j++) { - svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); - svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); - svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); + SV_TYPE temp0_vec = SV_DUP(alpha * x0_ptr[ix]); + SV_TYPE temp1_vec = SV_DUP(alpha * x1_ptr[ix]); + SV_TYPE temp2_vec = SV_DUP(alpha * x2_ptr[ix]); - SV_TYPE temp0_vec = ((j + width * 0) < n) ? SV_DUP(alpha * x0_ptr[ix]) : SV_DUP(0.0); - SV_TYPE temp1_vec = ((j + width * 1) < n) ? SV_DUP(alpha * x1_ptr[ix]) : SV_DUP(0.0); - SV_TYPE temp2_vec = ((j + width * 2) < n) ? SV_DUP(alpha * x2_ptr[ix]) : SV_DUP(0.0); i = 0; - BLASLONG sve_size = SV_COUNT(); - while ((i + sve_size * 1 - 1) < m) { - SV_TYPE y0_vec = svld1_vnum(SV_TRUE(), y + i, 0); + while ((i + sve_size - 1) < m) { + SV_TYPE y0_vec = svld1(pg_full, y + i); - SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); - SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); - SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); + SV_TYPE a00_vec = svld1(pg_full, a0_ptr + i); + SV_TYPE a01_vec = svld1(pg_full, a1_ptr + i); + SV_TYPE a02_vec = svld1(pg_full, a2_ptr + i); - y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec); - y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec); - y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec); + y0_vec = svmla_x(pg_full, y0_vec, temp0_vec, a00_vec); + y0_vec = svmla_x(pg_full, y0_vec, temp1_vec, a01_vec); + y0_vec = svmla_x(pg_full, y0_vec, temp2_vec, a02_vec); - svst1_vnum(SV_TRUE(), y + i, 0, y0_vec); - i += sve_size * 1; + svst1(pg_full, y + i, y0_vec); + i += sve_size; } if (i < m) { - svbool_t pg0 = SV_WHILE(i + sve_size * 0, m); - - pg00 = svand_z(SV_TRUE(), pg0, pg00); - pg01 = svand_z(SV_TRUE(), pg0, pg01); - pg02 = svand_z(SV_TRUE(), pg0, pg02); + SV_TYPE y0_vec = svld1(pg_tail, y + i); - SV_TYPE y0_vec = svld1_vnum(pg0, y + i, 0); + SV_TYPE a00_vec = svld1(pg_tail, a0_ptr + i); + SV_TYPE a01_vec = svld1(pg_tail, a1_ptr + i); + SV_TYPE a02_vec = svld1(pg_tail, a2_ptr + i); - SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); - SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); - SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); + y0_vec = svmla_m(pg_tail, y0_vec, temp0_vec, a00_vec); + y0_vec = svmla_m(pg_tail, y0_vec, temp1_vec, a01_vec); + y0_vec = svmla_m(pg_tail, y0_vec, temp2_vec, a02_vec); - y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec); - y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec); - y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec); - - svst1_vnum(pg0, y + i, 0, y0_vec); + svst1(pg_tail, y + i, y0_vec); } a0_ptr += lda; a1_ptr += lda; a2_ptr += lda; ix += inc_x; } + // Handle remaining n % 3 columns + for (j = width * 3; j < n; j++) { + FLOAT *a_col = a + j * lda; + temp = alpha * x[j * inc_x]; + SV_TYPE temp_vec = SV_DUP(temp); + + i = 0; + while ((i + sve_size - 1) < m) { + SV_TYPE y_vec = svld1(pg_full, y + i); + + SV_TYPE a_vec = svld1(pg_full, a_col + i); + + y_vec = svmla_x(pg_full, y_vec, temp_vec, a_vec); + + svst1(pg_full, y + i, y_vec); + i += sve_size; + } + if (i < m) { + SV_TYPE y_vec = svld1(pg_tail, y + i); + + SV_TYPE a_vec = svld1(pg_tail, a_col + i); + + y_vec = svmla_m(pg_tail, y_vec, temp_vec, a_vec); + + svst1(pg_tail, y + i, y_vec); + } + } return(0); } + // Fallback scalar loop for (j = 0; j < n; j++) { temp = alpha * x[ix]; iy = 0; diff --git a/kernel/arm64/ger_sve_v1x3.c b/kernel/arm64/ger_sve_v1x3.c new file mode 100644 index 0000000000..321465a094 --- /dev/null +++ b/kernel/arm64/ger_sve_v1x3.c @@ -0,0 +1,135 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#ifdef DOUBLE +#define SV_COUNT svcntd +#define SV_TYPE svfloat64_t +#define SV_TRUE svptrue_b64 +#define SV_WHILE svwhilelt_b64_s64 +#define SV_DUP svdup_f64 +#else +#define SV_COUNT svcntw +#define SV_TYPE svfloat32_t +#define SV_TRUE svptrue_b32 +#define SV_WHILE svwhilelt_b32_s64 +#define SV_DUP svdup_f32 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, + FLOAT *x, BLASLONG incx, + FLOAT *y, BLASLONG incy, + FLOAT *a, BLASLONG lda, FLOAT *buffer){ + + FLOAT *X = x; + + if (incx != 1) { + X = buffer; + COPY_K(m, x, incx, X, 1); + } + + BLASLONG width = (n + 3 - 1) / 3; + BLASLONG i, j; + BLASLONG sve_size = SV_COUNT(); + + FLOAT *y0_ptr = y + incy * width * 0; + FLOAT *y1_ptr = y + incy * width * 1; + FLOAT *y2_ptr = y + incy * width * 2; + + for (j = 0; j < width; j++) { + svbool_t pg00 = (j + width * 0 < n) ? SV_TRUE() : svpfalse(); + svbool_t pg01 = (j + width * 1 < n) ? SV_TRUE() : svpfalse(); + svbool_t pg02 = (j + width * 2 < n) ? SV_TRUE() : svpfalse(); + + SV_TYPE temp0_vec = (j + width * 0 < n) ? SV_DUP(alpha * *y0_ptr) : SV_DUP(0.0); + SV_TYPE temp1_vec = (j + width * 1 < n) ? SV_DUP(alpha * *y1_ptr) : SV_DUP(0.0); + SV_TYPE temp2_vec = (j + width * 2 < n) ? SV_DUP(alpha * *y2_ptr) : SV_DUP(0.0); + + FLOAT *x_ptr = X; + FLOAT *a0_ptr = a + lda * width * 0 + lda * j; + FLOAT *a1_ptr = a + lda * width * 1 + lda * j; + FLOAT *a2_ptr = a + lda * width * 2 + lda * j; + + i = 0; + while (i + sve_size * 1 - 1 < m) { + SV_TYPE x0_vec = svld1_vnum(SV_TRUE(), x_ptr, 0); + + SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr, 0); + SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr, 0); + SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr, 0); + + a00_vec = svmla_x(pg00, a00_vec, temp0_vec, x0_vec); + a01_vec = svmla_x(pg01, a01_vec, temp1_vec, x0_vec); + a02_vec = svmla_x(pg02, a02_vec, temp2_vec, x0_vec); + + svst1_vnum(pg00, a0_ptr, 0, a00_vec); + svst1_vnum(pg01, a1_ptr, 0, a01_vec); + svst1_vnum(pg02, a2_ptr, 0, a02_vec); + + i += sve_size * 1; + x_ptr += sve_size * 1; + a0_ptr += sve_size * 1; + a1_ptr += sve_size * 1; + a2_ptr += sve_size * 1; + } + + if (i < m) { + svbool_t pg0 = SV_WHILE(i + sve_size * 0, m); + + pg00 = svand_z(SV_TRUE(), pg0, pg00); + pg01 = svand_z(SV_TRUE(), pg0, pg01); + pg02 = svand_z(SV_TRUE(), pg0, pg02); + + SV_TYPE x0_vec = svld1_vnum(pg0, x_ptr, 0); + + SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr, 0); + SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr, 0); + SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr, 0); + + a00_vec = svmla_x(pg00, a00_vec, temp0_vec, x0_vec); + a01_vec = svmla_x(pg01, a01_vec, temp1_vec, x0_vec); + a02_vec = svmla_x(pg02, a02_vec, temp2_vec, x0_vec); + + svst1_vnum(pg00, a0_ptr, 0, a00_vec); + svst1_vnum(pg01, a1_ptr, 0, a01_vec); + svst1_vnum(pg02, a2_ptr, 0, a02_vec); + } + + y0_ptr += incy; + y1_ptr += incy; + y2_ptr += incy; + } + + return 0; +} diff --git a/kernel/arm64/nrm2.S b/kernel/arm64/nrm2.S index 0e5a8eed13..93218ff6e0 100644 --- a/kernel/arm64/nrm2.S +++ b/kernel/arm64/nrm2.S @@ -35,16 +35,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define I x3 #if !defined(DOUBLE) -#define SSQ s0 -#define SCALE s1 -#define REGZERO s5 -#define REGONE s6 -#else +#define SSQF s0 +#endif + #define SSQ d0 #define SCALE d1 #define REGZERO d5 #define REGONE d6 -#endif /******************************************************************************* * Macro definitions @@ -53,22 +50,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 #if !defined(DOUBLE) ldr s4, [X], #4 - fcmp s4, REGZERO - beq 2f /* KERNEL_F1_NEXT_\@ */ - fabs s4, s4 - fcmp SCALE, s4 - bge 1f /* KERNEL_F1_SCALE_GE_X_\@ */ - fdiv s2, SCALE, s4 - fmul s2, s2, s2 - fmul s3, SSQ, s2 - fadd SSQ, REGONE, s3 - fmov SCALE, s4 - b 2f /* KERNEL_F1_NEXT_\@ */ -1: /* KERNEL_F1_SCALE_GE_X_\@: */ - fdiv s2, s4, SCALE - fmla SSQ, s2, v2.s[0] + fcvt d4, s4 #else ldr d4, [X], #8 +#endif fcmp d4, REGZERO beq 2f /* KERNEL_F1_NEXT_\@ */ fabs d4, d4 @@ -83,29 +68,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 1: /* KERNEL_F1_SCALE_GE_X_\@: */ fdiv d2, d4, SCALE fmla SSQ, d2, v2.d[0] -#endif 2: /* KERNEL_F1_NEXT_\@: */ .endm .macro KERNEL_S1 #if !defined(DOUBLE) ldr s4, [X] - fcmp s4, REGZERO - beq KERNEL_S1_NEXT - fabs s4, s4 - fcmp SCALE, s4 - bge KERNEL_S1_SCALE_GE_X - fdiv s2, SCALE, s4 - fmul s2, s2, s2 - fmul s3, SSQ, s2 - fadd SSQ, REGONE, s3 - fmov SCALE, s4 - b KERNEL_S1_NEXT -KERNEL_S1_SCALE_GE_X: - fdiv s2, s4, SCALE - fmla SSQ, s2, v2.s[0] + fcvt d4, s4 #else ldr d4, [X] +#endif fcmp d4, REGZERO beq KERNEL_S1_NEXT fabs d4, d4 @@ -120,7 +92,6 @@ KERNEL_S1_SCALE_GE_X: KERNEL_S1_SCALE_GE_X: fdiv d2, d4, SCALE fmla SSQ, d2, v2.d[0] -#endif KERNEL_S1_NEXT: add X, X, INC_X .endm @@ -218,7 +189,9 @@ KERNEL_S1_NEXT: .Lnrm2_kernel_L999: fsqrt SSQ, SSQ fmul SSQ, SCALE, SSQ - +#if !defined(DOUBLE) + fcvt SSQF, SSQ +#endif ret EPILOGUE diff --git a/kernel/arm64/sbgemm_kernel_4x4_neoversev1_impl.c b/kernel/arm64/sbgemm_kernel_4x4_neoversev1_impl.c deleted file mode 100644 index b6d9e9816c..0000000000 --- a/kernel/arm64/sbgemm_kernel_4x4_neoversev1_impl.c +++ /dev/null @@ -1,414 +0,0 @@ -/*************************************************************************** - * Copyright (c) 2024-2025, The OpenBLAS Project - * All rights reserved. - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * 3. Neither the name of the OpenBLAS project nor the names of - * its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * *****************************************************************************/ - -#include - -#include "common.h" - -#define INIT_C(M, N) mc##M##N = svdup_f32(0); - -#define MATMUL(M, N) mc##M##N = svbfmmla(mc##M##N, ma##M, mb##N); - -#define INIT_C_4x4 \ - do { \ - INIT_C(0, 0); \ - INIT_C(0, 1); \ - INIT_C(1, 0); \ - INIT_C(1, 1); \ - } while (0); - -#ifdef ALPHA_ONE -#define UPDATE_C(PG, PTR, DST, SRC) \ - do { \ - DST = svld1_f32((PG), (PTR)); \ - DST = svadd_z((PG), SRC, DST); \ - svst1_f32((PG), (PTR), DST); \ - } while (0); -#else -#define UPDATE_C(PG, PTR, DST, SRC) \ - do { \ - DST = svld1_f32((PG), (PTR)); \ - DST = svmad_z((PG), svalpha, SRC, DST); \ - svst1_f32((PG), (PTR), DST); \ - } while (0); -#endif - -#define ZIP_EVEN_ELEMENTS(PG, mc0, mc1, tmp, vc) \ - do { \ - (tmp) = svuzp1_f32((mc0), (mc1)); \ - (vc) = svcompact_f32((PG), (tmp)); \ - } while (0) - -#define ZIP_ODD_ELEMENTS(PG, mc0, mc1, tmp, vc) \ - do { \ - (tmp) = svuzp2_f32((mc0), (mc1)); \ - (vc) = svcompact_f32((PG), (tmp)); \ - } while (0) - -#define ACCUMULATE_LAST4_TO_FIRST4(M, N, TMP) \ - do { \ - TMP = svext_f32(mc##M##N, mc##M##N, 4); \ - mc##M##N = svadd_f32_z(svptrue_b32(), mc##M##N, (TMP)); \ - } while (0) - -#ifdef ALPHA_ONE -int sbgemm_kernel_neoversev1_alpha_one(BLASLONG m, BLASLONG n, BLASLONG k, - FLOAT alpha, IFLOAT *A, IFLOAT *B, - FLOAT *C, BLASLONG ldc) -#else -int sbgemm_kernel_neoversev1_alpha(BLASLONG m, BLASLONG n, BLASLONG k, - FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, - BLASLONG ldc) -#endif -{ - - BLASLONG pad_k = (k + 7) & ~7; - svbfloat16_t ma0, ma1, mb0, mb1; - svfloat32_t mc00, mc01, mc10, mc11, vc0, vc1, vc2, vc3, oc0, oc1, oc2, oc3; - svfloat32_t tmp; - svfloat32_t svalpha = svdup_f32(alpha); - - svbool_t pg16_all = svptrue_b16(); - - svbool_t pg32_first_1 = svwhilelt_b32(0, 1); - svbool_t pg32_first_2 = svwhilelt_b32(0, 2); - svbool_t pg32_first_4 = svwhilelt_b32(0, 4); - - svbool_t pg32_select_first_2_per_quadword = svdupq_b32(1, 1, 0, 0); - - bfloat16_t *ptr_a = (bfloat16_t *)A; - bfloat16_t *ptr_b = (bfloat16_t *)B; - FLOAT *ptr_c = C; - - bfloat16_t *ptr_a0; - bfloat16_t *ptr_b0; - FLOAT *ptr_c0, *ptr_c1, *ptr_c2, *ptr_c3; - - for (BLASLONG j = 0; j < n / 4; j++) { - ptr_c0 = ptr_c; - ptr_c1 = ptr_c0 + ldc; - ptr_c2 = ptr_c1 + ldc; - ptr_c3 = ptr_c2 + ldc; - ptr_c += 4 * ldc; - ptr_a = (bfloat16_t *)A; - - for (BLASLONG i = 0; i < m / 4; i++) { - ptr_a0 = ptr_a; - ptr_a += 4 * pad_k; - - ptr_b0 = ptr_b; - - INIT_C_4x4; - - for (BLASLONG p = 0; p < pad_k; p += 8) { - ma0 = svld1_bf16(pg16_all, ptr_a0); - ma1 = svld1_bf16(pg16_all, ptr_a0 + 16); - - mb0 = svld1_bf16(pg16_all, ptr_b0); - mb1 = svld1_bf16(pg16_all, ptr_b0 + 16); - - MATMUL(0, 0); - MATMUL(0, 1); - MATMUL(1, 0); - MATMUL(1, 1); - - ptr_a0 += 32; - ptr_b0 += 32; - } - - ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); - ACCUMULATE_LAST4_TO_FIRST4(0, 1, tmp); - ACCUMULATE_LAST4_TO_FIRST4(1, 0, tmp); - ACCUMULATE_LAST4_TO_FIRST4(1, 1, tmp); - - ZIP_EVEN_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc0); - ZIP_ODD_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc1); - - ZIP_EVEN_ELEMENTS(pg32_select_first_2_per_quadword, mc01, mc11, tmp, vc2); - ZIP_ODD_ELEMENTS(pg32_select_first_2_per_quadword, mc01, mc11, tmp, vc3); - - UPDATE_C(pg32_first_4, ptr_c0, oc0, vc0); - UPDATE_C(pg32_first_4, ptr_c1, oc1, vc1); - UPDATE_C(pg32_first_4, ptr_c2, oc2, vc2) - UPDATE_C(pg32_first_4, ptr_c3, oc3, vc3) - - ptr_c0 += 4; - ptr_c1 += 4; - ptr_c2 += 4; - ptr_c3 += 4; - } - - if (m & 2) { - ptr_a0 = ptr_a; - ptr_a += 2 * pad_k; - - ptr_b0 = ptr_b; - INIT_C(0, 0); - INIT_C(0, 1); - for (BLASLONG p = 0; p < pad_k; p += 8) { - ma0 = svld1_bf16(pg16_all, ptr_a0); - mb0 = svld1_bf16(pg16_all, ptr_b0); - mb1 = svld1_bf16(pg16_all, ptr_b0 + 16); - - MATMUL(0, 0); - MATMUL(0, 1); - - ptr_a0 += 16; - ptr_b0 += 32; - } - - ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); - ACCUMULATE_LAST4_TO_FIRST4(0, 1, tmp); - - vc0 = svuzp1(mc00, mc00); - vc1 = svuzp2(mc00, mc00); - vc2 = svuzp1(mc01, mc01); - vc3 = svuzp2(mc01, mc01); - - UPDATE_C(pg32_first_2, ptr_c0, oc0, vc0); - UPDATE_C(pg32_first_2, ptr_c1, oc1, vc1); - UPDATE_C(pg32_first_2, ptr_c2, oc2, vc2); - UPDATE_C(pg32_first_2, ptr_c3, oc3, vc3); - - ptr_c0 += 2; - ptr_c1 += 2; - ptr_c2 += 2; - ptr_c3 += 2; - } - - if (m & 1) { - ptr_a0 = ptr_a; - ptr_b0 = ptr_b; - - INIT_C(0, 0); - INIT_C(0, 1); - for (BLASLONG p = 0; p < pad_k; p += 8) { - ma0 = svld1_bf16(pg16_all, ptr_a0); - mb0 = svld1_bf16(pg16_all, ptr_b0); - mb1 = svld1_bf16(pg16_all, ptr_b0 + 16); - - MATMUL(0, 0); - MATMUL(0, 1); - - ptr_a0 += 16; - ptr_b0 += 32; - } - - ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); - ACCUMULATE_LAST4_TO_FIRST4(0, 1, tmp); - - // use compact is more straightforward - vc1 = svuzp2(mc00, mc00); - vc3 = svuzp2(mc01, mc01); - - UPDATE_C(pg32_first_1, ptr_c0, oc0, mc00); - UPDATE_C(pg32_first_1, ptr_c1, oc1, vc1); - UPDATE_C(pg32_first_1, ptr_c2, oc2, mc01); - UPDATE_C(pg32_first_1, ptr_c3, oc3, vc3); - } - - ptr_b += 4 * pad_k; - } - - if (n & 2) { - ptr_c0 = ptr_c; - ptr_c1 = ptr_c0 + ldc; - ptr_c += 2 * ldc; - ptr_a = (bfloat16_t *)A; - - for (BLASLONG i = 0; i < m / 4; i++) { - ptr_a0 = ptr_a; - ptr_a += 4 * pad_k; - - ptr_b0 = ptr_b; - - INIT_C(0, 0); - INIT_C(1, 0); - - for (BLASLONG p = 0; p < pad_k; p += 8) { - ma0 = svld1_bf16(pg16_all, ptr_a0); - ma1 = svld1_bf16(pg16_all, ptr_a0 + 16); - - mb0 = svld1_bf16(pg16_all, ptr_b0); - - MATMUL(0, 0); - MATMUL(1, 0); - - ptr_a0 += 32; - ptr_b0 += 16; - } - - ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); - ACCUMULATE_LAST4_TO_FIRST4(1, 0, tmp); - - ZIP_EVEN_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc0); - ZIP_ODD_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc2); - - UPDATE_C(pg32_first_4, ptr_c0, oc0, vc0); - UPDATE_C(pg32_first_4, ptr_c1, oc2, vc2); - - ptr_c0 += 4; - ptr_c1 += 4; - } - - if (m & 2) { - ptr_a0 = ptr_a; - ptr_a += 2 * pad_k; - ptr_b0 = ptr_b; - - INIT_C(0, 0); - - for (BLASLONG p = 0; p < pad_k; p += 8) { - ma0 = svld1_bf16(pg16_all, ptr_a0); - mb0 = svld1_bf16(pg16_all, ptr_b0); - - MATMUL(0, 0); - - ptr_a0 += 16; - ptr_b0 += 16; - } - - ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); - vc0 = svuzp1(mc00, mc00); - vc1 = svuzp2(mc00, mc00); - - UPDATE_C(pg32_first_2, ptr_c0, oc0, vc0); - UPDATE_C(pg32_first_2, ptr_c1, oc1, vc1); - - ptr_c0 += 2; - ptr_c1 += 2; - } - - if (m & 1) { - ptr_a0 = ptr_a; - ptr_b0 = ptr_b; - INIT_C(0, 0); - for (BLASLONG p = 0; p < pad_k; p += 8) { - ma0 = svld1_bf16(pg16_all, ptr_a0); - mb0 = svld1_bf16(pg16_all, ptr_b0); - MATMUL(0, 0); - ptr_a0 += 16; - ptr_b0 += 16; - } - - ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); - vc1 = svuzp2(mc00, mc00); - - UPDATE_C(pg32_first_1, ptr_c0, oc0, mc00); - UPDATE_C(pg32_first_1, ptr_c1, oc1, vc1); - } - - ptr_b += 2 * pad_k; - } - - if (n & 1) { // TODO: this case seems a overhead. find out whether it's in our - // case. - ptr_c0 = ptr_c; - ptr_a = (bfloat16_t *)A; - - for (BLASLONG i = 0; i < m / 4; i++) { - ptr_a0 = ptr_a; - ptr_a += 4 * pad_k; - - ptr_b0 = ptr_b; - - INIT_C(0, 0); - INIT_C(1, 0); - - for (BLASLONG p = 0; p < pad_k; p += 8) { - ma0 = svld1_bf16(pg16_all, ptr_a0); - ma1 = svld1_bf16(pg16_all, ptr_a0 + 16); - - mb0 = svld1_bf16(pg16_all, ptr_b0); - - MATMUL(0, 0); - MATMUL(1, 0); - - ptr_a0 += 32; - ptr_b0 += 16; - } - - ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); - ACCUMULATE_LAST4_TO_FIRST4(1, 0, tmp); - - ZIP_EVEN_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc0); - - UPDATE_C(pg32_first_4, ptr_c0, oc0, vc0); - - ptr_c0 += 4; - } - - if (m & 2) { - ptr_a0 = ptr_a; - ptr_a += 2 * pad_k; - ptr_b0 = ptr_b; - - INIT_C(0, 0); - - for (BLASLONG p = 0; p < pad_k; p += 8) { - ma0 = svld1_bf16(pg16_all, ptr_a0); - mb0 = svld1_bf16(pg16_all, ptr_b0); - - MATMUL(0, 0); - - ptr_a0 += 16; - ptr_b0 += 16; - } - - ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); - - vc0 = svuzp1(mc00, mc00); - - UPDATE_C(pg32_first_2, ptr_c0, oc0, vc0); - - ptr_c0 += 2; - } - - if (m & 1) { - ptr_a0 = ptr_a; - ptr_b0 = ptr_b; - - INIT_C(0, 0); - for (BLASLONG p = 0; p < pad_k; p += 8) { - - ma0 = svld1_bf16(pg16_all, ptr_a0); - mb0 = svld1_bf16(pg16_all, ptr_b0); - - MATMUL(0, 0); - ptr_a0 += 16; - ptr_b0 += 16; - } - - ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); - - UPDATE_C(pg32_first_1, ptr_c0, oc0, mc00); - } - } - - return 0; -} diff --git a/kernel/arm64/sbgemm_kernel_8x4_neoversen2.c b/kernel/arm64/sbgemm_kernel_8x4_neoversen2.c index 4c1385fbed..60634aa2ee 100644 --- a/kernel/arm64/sbgemm_kernel_8x4_neoversen2.c +++ b/kernel/arm64/sbgemm_kernel_8x4_neoversen2.c @@ -1,5 +1,5 @@ /*************************************************************************** - * Copyright (c) 2022, The OpenBLAS Project + * Copyright (c) 2022,2025 The OpenBLAS Project * All rights reserved. * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are @@ -27,19 +27,30 @@ * *****************************************************************************/ #include +#include #include "common.h" #define ALPHA_ONE #include "sbgemm_kernel_8x4_neoversen2_impl.c" #undef ALPHA_ONE +#undef UPDATE_C #include "sbgemm_kernel_8x4_neoversen2_impl.c" int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, BLASLONG ldc) { - if (alpha == 1.0f) - return sbgemm_kernel_neoversen2_alpha_one(m, n, k, alpha, A, B, C, ldc); +#ifdef BGEMM + bfloat16_t alpha_bf16; + memcpy(&alpha_bf16, &alpha, sizeof(bfloat16_t)); + float alpha_f32 = vcvtah_f32_bf16(alpha_bf16); +#else + float alpha_f32 = alpha; +#endif + + if (alpha_f32 == 1.0f) + return gemm_kernel_neoversen2_alpha_one(m, n, k, alpha, A, B, C, ldc); else - return sbgemm_kernel_neoversen2_alpha(m, n, k, alpha, A, B, C, ldc); + return gemm_kernel_neoversen2_alpha(m, n, k, alpha, A, B, C, ldc); + return 0; } diff --git a/kernel/arm64/sbgemm_kernel_8x4_neoversen2_impl.c b/kernel/arm64/sbgemm_kernel_8x4_neoversen2_impl.c index 26ea7ee61b..d4e0a38afe 100644 --- a/kernel/arm64/sbgemm_kernel_8x4_neoversen2_impl.c +++ b/kernel/arm64/sbgemm_kernel_8x4_neoversen2_impl.c @@ -1,5 +1,5 @@ /*************************************************************************** - * Copyright (c) 2022, The OpenBLAS Project + * Copyright (c) 2022,2025 The OpenBLAS Project * All rights reserved. * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are @@ -27,6 +27,7 @@ * *****************************************************************************/ #include +#include #include "common.h" @@ -46,49 +47,97 @@ INIT_C(3, 1); \ } while (0); +#ifdef BGEMM #ifdef ALPHA_ONE -#define UPDATE_C(PG, PTR, DST, SRC) \ - do { \ - DST = svld1_f32((PG), (PTR)); \ - DST = svadd_z((PG), SRC, DST); \ - svst1_f32((PG), (PTR), DST); \ +#define UPDATE_C(PG16, PG32, PTR, SRC) \ + do { \ + tmp16 = svld1_bf16((PG16), (PTR)); \ + tmp32 = svreinterpret_f32(svzip1_bf16(zeros, tmp16)); \ + tmp32 = svadd_z((PG32), SRC, tmp32); \ + tmp16 = svcvt_bf16_f32_z((PG32), tmp32); \ + tmp16 = svuzp1_bf16(tmp16, tmp16); \ + svst1_bf16((PG16), (PTR), tmp16); \ + } while (0) +#else +#define UPDATE_C(PG16, PG32, PTR, SRC) \ + do { \ + tmp16 = svld1_bf16((PG16), (PTR)); \ + tmp32 = svreinterpret_f32(svzip1_bf16(zeros, tmp16)); \ + tmp32 = svmad_z((PG32), svalpha, SRC, tmp32); \ + tmp16 = svcvt_bf16_f32_z((PG32), tmp32); \ + tmp16 = svuzp1_bf16(tmp16, tmp16); \ + svst1_bf16((PG16), (PTR), tmp16); \ + } while (0) +#endif +#else +#ifdef ALPHA_ONE +#define UPDATE_C(PG16, PG32, PTR, SRC) \ + do { \ + tmp32 = svld1_f32((PG32), (PTR)); \ + tmp32 = svadd_z((PG32), SRC, tmp32); \ + svst1_f32((PG32), (PTR), tmp32); \ } while (0); #else -#define UPDATE_C(PG, PTR, DST, SRC) \ - do { \ - DST = svld1_f32((PG), (PTR)); \ - DST = svmad_z((PG), svalpha, SRC, DST); \ - svst1_f32((PG), (PTR), DST); \ +#define UPDATE_C(PG16, PG32, PTR, SRC) \ + do { \ + tmp32 = svld1_f32((PG32), (PTR)); \ + tmp32 = svmad_z((PG32), svalpha, SRC, tmp32); \ + svst1_f32((PG32), (PTR), tmp32); \ } while (0); + #endif + #endif + +#ifdef BGEMM +#define OUTPUT_FLOAT bfloat16_t +#else +#define OUTPUT_FLOAT float #endif #ifdef ALPHA_ONE -int sbgemm_kernel_neoversen2_alpha_one(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) +static int gemm_kernel_neoversen2_alpha_one(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) #else -int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) +static int gemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) #endif { BLASLONG pad_k = (k + 3) & ~3; svbfloat16_t ma0, ma1, ma2, ma3, mb0, mb1; svfloat32_t mc00, mc01, mc10, mc11, mc20, mc21, mc30, mc31, - vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7, - oc0, oc1, oc2, oc3, oc4, oc5, oc6, oc7; + vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; + +#ifndef ALPHA_ONE +#ifdef BGEMM + bfloat16_t alpha_bf16; + memcpy(&alpha_bf16, &alpha, sizeof(bfloat16_t)); + svfloat32_t svalpha = svdup_f32(vcvtah_f32_bf16(alpha_bf16)); +#else svfloat32_t svalpha = svdup_f32(alpha); +#endif +#endif - svbool_t pg16 = svptrue_b16(); - svbool_t pg16_low = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0); - svbool_t pg32 = svptrue_b32(); - svbool_t pg32_low = svdupq_b32(1, 1, 0, 0); - svbool_t pg32_first = svdupq_b32(1, 0, 0, 0); + svbool_t pg32_first_4 = svdupq_b32(1, 1, 1, 1); + svbool_t pg32_first_2 = svdupq_b32(1, 1, 0, 0); + svbool_t pg32_first_1 = svdupq_b32(1, 0, 0, 0); + svbool_t pg16_first_8 = svdupq_b16(1, 1, 1, 1, 1, 1, 1, 1); + svbool_t pg16_first_4 = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0); +#ifdef BGEMM + svbool_t pg16_first_2 = svdupq_b16(1, 1, 0, 0, 0, 0, 0, 0); + svbool_t pg16_first_1 = svdupq_b16(1, 0, 0, 0, 0, 0, 0, 0); + svbfloat16_t zeros = svdup_n_bf16(vcvth_bf16_f32(0.0)); +#endif bfloat16_t *ptr_a = (bfloat16_t *)A; bfloat16_t *ptr_b = (bfloat16_t *)B; - FLOAT *ptr_c = C; + OUTPUT_FLOAT *ptr_c = (OUTPUT_FLOAT*)C; + + bfloat16_t *ptr_a0; + bfloat16_t *ptr_b0; + OUTPUT_FLOAT *ptr_c0, *ptr_c1, *ptr_c2, *ptr_c3; - bfloat16_t *ptr_a0, *ptr_a1, *ptr_a2, *ptr_a3; - bfloat16_t *ptr_b0, *ptr_b1; - FLOAT *ptr_c0, *ptr_c1, *ptr_c2, *ptr_c3; + svfloat32_t tmp32; +#ifdef BGEMM + svbfloat16_t tmp16; +#endif for (BLASLONG j = 0; j < n / 4; j++) { ptr_c0 = ptr_c; @@ -107,13 +156,13 @@ int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alp INIT_C_8x4; for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - ma1 = svld1_bf16(pg16, ptr_a0 + 8); - ma2 = svld1_bf16(pg16, ptr_a0 + 16); - ma3 = svld1_bf16(pg16, ptr_a0 + 24); + ma0 = svld1_bf16(pg16_first_8, ptr_a0); + ma1 = svld1_bf16(pg16_first_8, ptr_a0 + 8); + ma2 = svld1_bf16(pg16_first_8, ptr_a0 + 16); + ma3 = svld1_bf16(pg16_first_8, ptr_a0 + 24); - mb0 = svld1_bf16(pg16, ptr_b0); - mb1 = svld1_bf16(pg16, ptr_b0 + 8); + mb0 = svld1_bf16(pg16_first_8, ptr_b0); + mb1 = svld1_bf16(pg16_first_8, ptr_b0 + 8); MATMUL(0, 0); MATMUL(0, 1); MATMUL(1, 0); MATMUL(1, 1); @@ -133,14 +182,14 @@ int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alp vc6 = svuzp2(mc01, mc11); vc7 = svuzp2(mc21, mc31); - UPDATE_C(pg32, ptr_c0, oc0, vc0); - UPDATE_C(pg32, ptr_c0+4, oc1, vc1); - UPDATE_C(pg32, ptr_c1, oc2, vc2); - UPDATE_C(pg32, ptr_c1+4, oc3, vc3); - UPDATE_C(pg32, ptr_c2, oc4, vc4) - UPDATE_C(pg32, ptr_c2+4, oc5, vc5); - UPDATE_C(pg32, ptr_c3, oc6, vc6) - UPDATE_C(pg32, ptr_c3+4, oc7, vc7); + UPDATE_C(pg16_first_4, pg32_first_4, ptr_c0, vc0); + UPDATE_C(pg16_first_4, pg32_first_4, ptr_c0+4, vc1); + UPDATE_C(pg16_first_4, pg32_first_4, ptr_c1, vc2); + UPDATE_C(pg16_first_4, pg32_first_4, ptr_c1+4, vc3); + UPDATE_C(pg16_first_4, pg32_first_4, ptr_c2, vc4); + UPDATE_C(pg16_first_4, pg32_first_4, ptr_c2+4, vc5); + UPDATE_C(pg16_first_4, pg32_first_4, ptr_c3, vc6); + UPDATE_C(pg16_first_4, pg32_first_4, ptr_c3+4, vc7); ptr_c0 += 8; ptr_c1 += 8; @@ -157,10 +206,10 @@ int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alp INIT_C(1, 0); INIT_C(1, 1); for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - ma1 = svld1_bf16(pg16, ptr_a0 + 8); - mb0 = svld1_bf16(pg16, ptr_b0); - mb1 = svld1_bf16(pg16, ptr_b0 + 8); + ma0 = svld1_bf16(pg16_first_8, ptr_a0); + ma1 = svld1_bf16(pg16_first_8, ptr_a0 + 8); + mb0 = svld1_bf16(pg16_first_8, ptr_b0); + mb1 = svld1_bf16(pg16_first_8, ptr_b0 + 8); MATMUL(0, 0); MATMUL(0, 1); MATMUL(1, 0); MATMUL(1, 1); @@ -174,10 +223,10 @@ int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alp vc2 = svuzp1(mc01, mc11); vc3 = svuzp2(mc01, mc11); - UPDATE_C(pg32, ptr_c0, oc0, vc0); - UPDATE_C(pg32, ptr_c1, oc1, vc1); - UPDATE_C(pg32, ptr_c2, oc2, vc2); - UPDATE_C(pg32, ptr_c3, oc3, vc3); + UPDATE_C(pg16_first_4, pg32_first_4, ptr_c0, vc0); + UPDATE_C(pg16_first_4, pg32_first_4, ptr_c1, vc1); + UPDATE_C(pg16_first_4, pg32_first_4, ptr_c2, vc2); + UPDATE_C(pg16_first_4, pg32_first_4, ptr_c3, vc3); ptr_c0 += 4; ptr_c1 += 4; @@ -192,9 +241,9 @@ int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alp INIT_C(0, 0); INIT_C(0, 1); for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - mb0 = svld1_bf16(pg16, ptr_b0); - mb1 = svld1_bf16(pg16, ptr_b0 + 8); + ma0 = svld1_bf16(pg16_first_8, ptr_a0); + mb0 = svld1_bf16(pg16_first_8, ptr_b0); + mb1 = svld1_bf16(pg16_first_8, ptr_b0 + 8); MATMUL(0, 0); MATMUL(0, 1); @@ -207,10 +256,10 @@ int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alp vc2 = svuzp1(mc01, mc01); vc3 = svuzp2(mc01, mc01); - UPDATE_C(pg32_low, ptr_c0, oc0, vc0); - UPDATE_C(pg32_low, ptr_c1, oc1, vc1); - UPDATE_C(pg32_low, ptr_c2, oc2, vc2); - UPDATE_C(pg32_low, ptr_c3, oc3, vc3); + UPDATE_C(pg16_first_2, pg32_first_2, ptr_c0, vc0); + UPDATE_C(pg16_first_2, pg32_first_2, ptr_c1, vc1); + UPDATE_C(pg16_first_2, pg32_first_2, ptr_c2, vc2); + UPDATE_C(pg16_first_2, pg32_first_2, ptr_c3, vc3); ptr_c0 += 2; ptr_c1 += 2; @@ -224,9 +273,9 @@ int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alp INIT_C(0, 0); INIT_C(0, 1); for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16_low, ptr_a0); - mb0 = svld1_bf16(pg16, ptr_b0); - mb1 = svld1_bf16(pg16, ptr_b0 + 8); + ma0 = svld1_bf16(pg16_first_4, ptr_a0); + mb0 = svld1_bf16(pg16_first_8, ptr_b0); + mb1 = svld1_bf16(pg16_first_8, ptr_b0 + 8); MATMUL(0, 0); MATMUL(0, 1); @@ -237,10 +286,10 @@ int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alp vc1 = svuzp2(mc00, mc00); vc3 = svuzp2(mc01, mc01); - UPDATE_C(pg32_first, ptr_c0, oc0, mc00); - UPDATE_C(pg32_first, ptr_c1, oc1, vc1); - UPDATE_C(pg32_first, ptr_c2, oc2, mc01); - UPDATE_C(pg32_first, ptr_c3, oc3, vc3); + UPDATE_C(pg16_first_1, pg32_first_1, ptr_c0, mc00); + UPDATE_C(pg16_first_1, pg32_first_1, ptr_c1, vc1); + UPDATE_C(pg16_first_1, pg32_first_1, ptr_c2, mc01); + UPDATE_C(pg16_first_1, pg32_first_1, ptr_c3, vc3); } @@ -265,12 +314,12 @@ int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alp INIT_C(3, 0); for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - ma1 = svld1_bf16(pg16, ptr_a0 + 8); - ma2 = svld1_bf16(pg16, ptr_a0 + 16); - ma3 = svld1_bf16(pg16, ptr_a0 + 24); + ma0 = svld1_bf16(pg16_first_8, ptr_a0); + ma1 = svld1_bf16(pg16_first_8, ptr_a0 + 8); + ma2 = svld1_bf16(pg16_first_8, ptr_a0 + 16); + ma3 = svld1_bf16(pg16_first_8, ptr_a0 + 24); - mb0 = svld1_bf16(pg16, ptr_b0); + mb0 = svld1_bf16(pg16_first_8, ptr_b0); MATMUL(0, 0); MATMUL(1, 0); @@ -286,10 +335,10 @@ int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alp vc2 = svuzp2(mc00, mc10); vc3 = svuzp2(mc20, mc30); - UPDATE_C(pg32, ptr_c0, oc0, vc0); - UPDATE_C(pg32, ptr_c0 + 4, oc1, vc1); - UPDATE_C(pg32, ptr_c1, oc2, vc2); - UPDATE_C(pg32, ptr_c1 + 4, oc3, vc3); + UPDATE_C(pg16_first_4, pg32_first_4, ptr_c0, vc0); + UPDATE_C(pg16_first_4, pg32_first_4, ptr_c0 + 4, vc1); + UPDATE_C(pg16_first_4, pg32_first_4, ptr_c1, vc2); + UPDATE_C(pg16_first_4, pg32_first_4, ptr_c1 + 4, vc3); ptr_c0 += 8; ptr_c1 += 8; @@ -304,9 +353,9 @@ int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alp INIT_C(1, 0); for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - ma1 = svld1_bf16(pg16, ptr_a0 + 8); - mb0 = svld1_bf16(pg16, ptr_b0); + ma0 = svld1_bf16(pg16_first_8, ptr_a0); + ma1 = svld1_bf16(pg16_first_8, ptr_a0 + 8); + mb0 = svld1_bf16(pg16_first_8, ptr_b0); MATMUL(0, 0); MATMUL(1, 0); ptr_a0 += 16; @@ -316,8 +365,8 @@ int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alp vc0 = svuzp1(mc00, mc10); vc1 = svuzp2(mc00, mc10); - UPDATE_C(pg32, ptr_c0, oc0, vc0); - UPDATE_C(pg32, ptr_c1, oc1, vc1); + UPDATE_C(pg16_first_4, pg32_first_4, ptr_c0, vc0); + UPDATE_C(pg16_first_4, pg32_first_4, ptr_c1, vc1); ptr_c0 += 4; ptr_c1 += 4; @@ -331,8 +380,8 @@ int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alp INIT_C(0, 0); for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - mb0 = svld1_bf16(pg16, ptr_b0); + ma0 = svld1_bf16(pg16_first_8, ptr_a0); + mb0 = svld1_bf16(pg16_first_8, ptr_b0); MATMUL(0, 0); @@ -342,8 +391,8 @@ int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alp vc0 = svuzp1(mc00, mc00); vc1 = svuzp2(mc00, mc00); - UPDATE_C(pg32_low, ptr_c0, oc0, vc0); - UPDATE_C(pg32_low, ptr_c1, oc1, vc1); + UPDATE_C(pg16_first_2, pg32_first_2, ptr_c0, vc0); + UPDATE_C(pg16_first_2, pg32_first_2, ptr_c1, vc1); ptr_c0 += 2; ptr_c1 += 2; @@ -355,16 +404,16 @@ int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alp ptr_b0 = ptr_b; INIT_C(0, 0); for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16_low, ptr_a0); - mb0 = svld1_bf16(pg16, ptr_b0); + ma0 = svld1_bf16(pg16_first_4, ptr_a0); + mb0 = svld1_bf16(pg16_first_8, ptr_b0); MATMUL(0, 0); ptr_a0 += 4; ptr_b0 += 8; } vc1 = svuzp2(mc00, mc00); - UPDATE_C(pg32_first, ptr_c0, oc0, mc00); - UPDATE_C(pg32_first, ptr_c1, oc1, vc1); + UPDATE_C(pg16_first_1, pg32_first_1, ptr_c0, mc00); + UPDATE_C(pg16_first_1, pg32_first_1, ptr_c1, vc1); } ptr_b += 2 * pad_k; @@ -386,12 +435,12 @@ int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alp INIT_C(3, 0); for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - ma1 = svld1_bf16(pg16, ptr_a0 + 8); - ma2 = svld1_bf16(pg16, ptr_a0 + 16); - ma3 = svld1_bf16(pg16, ptr_a0 + 24); + ma0 = svld1_bf16(pg16_first_8, ptr_a0); + ma1 = svld1_bf16(pg16_first_8, ptr_a0 + 8); + ma2 = svld1_bf16(pg16_first_8, ptr_a0 + 16); + ma3 = svld1_bf16(pg16_first_8, ptr_a0 + 24); - mb0 = svld1_bf16(pg16_low, ptr_b0); + mb0 = svld1_bf16(pg16_first_4, ptr_b0); MATMUL(0, 0); MATMUL(1, 0); @@ -405,8 +454,8 @@ int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alp vc0 = svuzp1(mc00, mc10); vc1 = svuzp1(mc20, mc30); - UPDATE_C(pg32, ptr_c0, oc0, vc0); - UPDATE_C(pg32, ptr_c0 + 4, oc1, vc1); + UPDATE_C(pg16_first_4, pg32_first_4, ptr_c0, vc0); + UPDATE_C(pg16_first_4, pg32_first_4, ptr_c0 + 4, vc1); ptr_c0 += 8; } @@ -418,16 +467,16 @@ int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alp INIT_C(0, 0); INIT_C(1, 0); for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - ma1 = svld1_bf16(pg16, ptr_a0 + 8); - mb0 = svld1_bf16(pg16_low, ptr_b0); + ma0 = svld1_bf16(pg16_first_8, ptr_a0); + ma1 = svld1_bf16(pg16_first_8, ptr_a0 + 8); + mb0 = svld1_bf16(pg16_first_4, ptr_b0); MATMUL(0, 0); MATMUL(1, 0); ptr_a0 += 16; ptr_b0 += 4; } vc0 = svuzp1(mc00, mc10); - UPDATE_C(pg32, ptr_c0, oc0, vc0); + UPDATE_C(pg16_first_4, pg32_first_4, ptr_c0, vc0); ptr_c0 += 4; } @@ -439,8 +488,8 @@ int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alp INIT_C(0, 0); for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - mb0 = svld1_bf16(pg16_low, ptr_b0); + ma0 = svld1_bf16(pg16_first_8, ptr_a0); + mb0 = svld1_bf16(pg16_first_4, ptr_b0); MATMUL(0, 0); @@ -448,7 +497,7 @@ int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alp ptr_b0 += 4; } vc0 = svuzp1(mc00, mc00); - UPDATE_C(pg32_low, ptr_c0, oc0, vc0); + UPDATE_C(pg16_first_2, pg32_first_2, ptr_c0, vc0); ptr_c0 += 2; } @@ -457,13 +506,13 @@ int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alp ptr_b0 = ptr_b; INIT_C(0, 0); for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16_low, ptr_a0); - mb0 = svld1_bf16(pg16_low, ptr_b0); + ma0 = svld1_bf16(pg16_first_4, ptr_a0); + mb0 = svld1_bf16(pg16_first_4, ptr_b0); MATMUL(0, 0); ptr_a0 += 4; ptr_b0 += 4; } - UPDATE_C(pg32_first, ptr_c0, oc0, mc00); + UPDATE_C(pg16_first_1, pg32_first_1, ptr_c0, mc00); } } diff --git a/kernel/arm64/sbgemm_ncopy_4_neoversev1.c b/kernel/arm64/sbgemm_ncopy_4_neoversev1.c deleted file mode 100644 index 100f5c68ea..0000000000 --- a/kernel/arm64/sbgemm_ncopy_4_neoversev1.c +++ /dev/null @@ -1,148 +0,0 @@ -/*************************************************************************** - * Copyright (c) 2024-2025, The OpenBLAS Project - * All rights reserved. - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * 3. Neither the name of the OpenBLAS project nor the names of - * its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * *****************************************************************************/ - -#include - -#include "common.h" - -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { - IFLOAT *a_offset; - IFLOAT *a_offsetx[4]; - IFLOAT *b_offset; - a_offset = a; - b_offset = b; - - bfloat16_t zero_value_bf16; - *((uint16_t *)(&zero_value_bf16)) = 0; - - svbool_t pg16_all = svptrue_b16(); // 16 elements for sve-256 machine. - svbool_t pg16_first_8 = svwhilelt_b16(0, 8); - - svbfloat16_t v0, v1, v2, v3; - svuint64_t t0, t1; - - BLASLONG rest = m & 7; - svbool_t pg16_rest = svwhilelt_b16_s32(0, rest); - - for (BLASLONG j = 0; j < n / 4; j++) { - a_offsetx[0] = a_offset; - a_offsetx[1] = a_offsetx[0] + lda; - a_offsetx[2] = a_offsetx[1] + lda; - a_offsetx[3] = a_offsetx[2] + lda; - a_offset += 4 * lda; - - for (BLASLONG i = 0; i < m / 8; i++) { - v0 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[0]); - v1 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[1]); - v2 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[2]); - v3 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[3]); - - t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1)); - t1 = svzip1_u64(svreinterpret_u64_bf16(v2), svreinterpret_u64_bf16(v3)); - - svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0)); - svst1_bf16(pg16_all, (bfloat16_t *)b_offset + 16, - svreinterpret_bf16_u64(t1)); - - a_offsetx[0] += 8; - a_offsetx[1] += 8; - a_offsetx[2] += 8; - a_offsetx[3] += 8; - - b_offset += 32; - } - - if (rest) { // remainder along k dim - v0 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[0]); - v1 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[1]); - v2 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[2]); - v3 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[3]); - - t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1)); - t1 = svzip1_u64(svreinterpret_u64_bf16(v2), svreinterpret_u64_bf16(v3)); - - svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0)); - svst1_bf16(pg16_all, (bfloat16_t *)b_offset + 16, - svreinterpret_bf16_u64(t1)); - - b_offset += 32; - } - } - - if (n & 2) { - a_offsetx[0] = a_offset; - a_offsetx[1] = a_offsetx[0] + lda; - a_offset += 2 * lda; - - for (BLASLONG i = 0; i < m / 8; i++) { - v0 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[0]); - v1 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[1]); - - t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1)); - svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0)); - - b_offset += 16; - a_offsetx[0] += 8; - a_offsetx[1] += 8; - } - - if (rest) { // remainder along k dim - v0 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[0]); - v1 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[1]); - - t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1)); - svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0)); - - b_offset += 16; - } - } - - if (n & 1) { - a_offsetx[0] = a_offset; - - for (BLASLONG i = 0; i < m / 8; i++) { - v0 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[0]); - v1 = svdup_bf16(zero_value_bf16); - - t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1)); - svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0)); - - b_offset += 16; - a_offsetx[0] += 8; - } - - if (rest) { // remainder along k dim - v0 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[0]); - v1 = svdup_bf16(zero_value_bf16); - t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1)); - svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0)); - } - } - - return 0; -} diff --git a/kernel/arm64/sbgemm_tcopy_4_neoversev1.c b/kernel/arm64/sbgemm_tcopy_4_neoversev1.c deleted file mode 100644 index 140e8f7edc..0000000000 --- a/kernel/arm64/sbgemm_tcopy_4_neoversev1.c +++ /dev/null @@ -1,361 +0,0 @@ -/*************************************************************************** - * Copyright (c) 2024-2025, The OpenBLAS Project - * All rights reserved. - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * 3. Neither the name of the OpenBLAS project nor the names of - * its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * *****************************************************************************/ -#include "common.h" -#include -#include - -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { - BLASLONG pad_m = ((m + 7) & ~7); - BLASLONG rest = (m & 7); // rest along m dim - - IFLOAT *a_offset; - IFLOAT *a_offset0, *a_offset1, *a_offset2, *a_offset3; - IFLOAT *a_offset4, *a_offset5, *a_offset6, *a_offset7; - - IFLOAT *b_offset; - IFLOAT *b_offset0, *b_offset1; - - a_offset = a; - b_offset = b; - - svuint16_t c0, c1, c2, c3, c4, c5, c6, c7; - svuint16_t t0, t1, t2, t3; - svuint32_t m00, m01, m10, m11; - svuint64_t st_offsets_0, st_offsets_1; - - svbool_t pg16_first_4 = svwhilelt_b16(0, 4); - svbool_t pg16_first_8 = svwhilelt_b16(0, 8); - - svbool_t pg64_first_4 = svwhilelt_b64(0, 4); - - u_int32_t sizeof_u64 = 8; - u_int64_t _st_offsets_0[4] = { - 0 * sizeof_u64, - 1 * sizeof_u64, - 4 * sizeof_u64, - 5 * sizeof_u64, - }; - - u_int64_t _st_offsets_1[4] = { - 2 * sizeof_u64, - 3 * sizeof_u64, - 6 * sizeof_u64, - 7 * sizeof_u64, - }; - - st_offsets_0 = svld1_u64(pg64_first_4, _st_offsets_0); - st_offsets_1 = svld1_u64(pg64_first_4, _st_offsets_1); - - for (BLASLONG j = 0; j < n / 8; j++) { - a_offset0 = a_offset; - a_offset1 = a_offset0 + lda; - a_offset2 = a_offset1 + lda; - a_offset3 = a_offset2 + lda; - a_offset4 = a_offset3 + lda; - a_offset5 = a_offset4 + lda; - a_offset6 = a_offset5 + lda; - a_offset7 = a_offset6 + lda; - a_offset += 8; - - b_offset0 = b_offset; - b_offset1 = b_offset0 + 4 * pad_m; - - b_offset += 8 * pad_m; - for (BLASLONG i = 0; i < m / 8; i++) { - // transpose 8x8 matrix and pack into two 4x8 block consists of two 2x4 - // small blocks - c0 = svld1_u16(pg16_first_8, a_offset0); - c1 = svld1_u16(pg16_first_8, a_offset1); - c2 = svld1_u16(pg16_first_8, a_offset2); - c3 = svld1_u16(pg16_first_8, a_offset3); - c4 = svld1_u16(pg16_first_8, a_offset4); - c5 = svld1_u16(pg16_first_8, a_offset5); - c6 = svld1_u16(pg16_first_8, a_offset6); - c7 = svld1_u16(pg16_first_8, a_offset7); - - t0 = svzip1_u16(c0, c1); - t1 = svzip1_u16(c2, c3); - t2 = svzip1_u16(c4, c5); - t3 = svzip1_u16(c6, c7); - - m00 = svzip1_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); - m10 = svzip2_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); - m01 = svzip1_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); - m11 = svzip2_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); - - svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, - st_offsets_0, svreinterpret_u64_u32(m00)); - svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, - st_offsets_1, svreinterpret_u64_u32(m01)); - svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset1, - st_offsets_0, svreinterpret_u64_u32(m10)); - svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset1, - st_offsets_1, svreinterpret_u64_u32(m11)); - - a_offset0 += 8 * lda; - a_offset1 += 8 * lda; - a_offset2 += 8 * lda; - a_offset3 += 8 * lda; - a_offset4 += 8 * lda; - a_offset5 += 8 * lda; - a_offset6 += 8 * lda; - a_offset7 += 8 * lda; - - b_offset0 += 32; - b_offset1 += 32; - } - - if (rest) { - c0 = svld1_u16(pg16_first_8, a_offset0); - c1 = (rest >= 2 ? svld1_u16(pg16_first_8, a_offset1) : svdup_u16(0)); - c2 = (rest >= 3 ? svld1_u16(pg16_first_8, a_offset2) : svdup_u16(0)); - c3 = (rest >= 4 ? svld1_u16(pg16_first_8, a_offset3) : svdup_u16(0)); - c4 = (rest >= 5 ? svld1_u16(pg16_first_8, a_offset4) : svdup_u16(0)); - c5 = (rest >= 6 ? svld1_u16(pg16_first_8, a_offset5) : svdup_u16(0)); - c6 = (rest == 7 ? svld1_u16(pg16_first_8, a_offset6) : svdup_u16(0)); - c7 = (svdup_u16(0)); - - t0 = svzip1_u16(c0, c1); - t1 = svzip1_u16(c2, c3); - t2 = svzip1_u16(c4, c5); - t3 = svzip1_u16(c6, c7); - - m00 = svzip1_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); - m10 = svzip2_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); - m01 = svzip1_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); - m11 = svzip2_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); - - svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, - st_offsets_0, svreinterpret_u64_u32(m00)); - svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, - st_offsets_1, svreinterpret_u64_u32(m01)); - svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset1, - st_offsets_0, svreinterpret_u64_u32(m10)); - svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset1, - st_offsets_1, svreinterpret_u64_u32(m11)); - } - } - - if (n & 4) { - a_offset0 = a_offset; - a_offset1 = a_offset0 + lda; - a_offset2 = a_offset1 + lda; - a_offset3 = a_offset2 + lda; - a_offset4 = a_offset3 + lda; - a_offset5 = a_offset4 + lda; - a_offset6 = a_offset5 + lda; - a_offset7 = a_offset6 + lda; - a_offset += 4; - - b_offset0 = b_offset; - b_offset += 4 * pad_m; - - for (BLASLONG i = 0; i < m / 8; i++) { - // transpose 8x8 matrix and pack into two 4x8 block consists of two 2x4 - // small blocks - c0 = svld1_u16(pg16_first_4, a_offset0); - c1 = svld1_u16(pg16_first_4, a_offset1); - c2 = svld1_u16(pg16_first_4, a_offset2); - c3 = svld1_u16(pg16_first_4, a_offset3); - c4 = svld1_u16(pg16_first_4, a_offset4); - c5 = svld1_u16(pg16_first_4, a_offset5); - c6 = svld1_u16(pg16_first_4, a_offset6); - c7 = svld1_u16(pg16_first_4, a_offset7); - - t0 = svzip1_u16(c0, c1); - t1 = svzip1_u16(c2, c3); - t2 = svzip1_u16(c4, c5); - t3 = svzip1_u16(c6, c7); - - m00 = svzip1_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); - m01 = svzip1_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); - svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, - st_offsets_0, svreinterpret_u64_u32(m00)); - svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, - st_offsets_1, svreinterpret_u64_u32(m01)); - - a_offset0 += 8 * lda; - a_offset1 += 8 * lda; - a_offset2 += 8 * lda; - a_offset3 += 8 * lda; - a_offset4 += 8 * lda; - a_offset5 += 8 * lda; - a_offset6 += 8 * lda; - a_offset7 += 8 * lda; - - b_offset0 += 32; - } - - if (rest) { - c0 = svld1_u16(pg16_first_4, a_offset0); // rest >= 1 - c1 = (rest >= 2 ? svld1_u16(pg16_first_4, a_offset1) : svdup_u16(0)); - c2 = (rest >= 3 ? svld1_u16(pg16_first_4, a_offset2) : svdup_u16(0)); - c3 = (rest >= 4 ? svld1_u16(pg16_first_4, a_offset3) : svdup_u16(0)); - c4 = (rest >= 5 ? svld1_u16(pg16_first_4, a_offset4) : svdup_u16(0)); - c5 = (rest >= 6 ? svld1_u16(pg16_first_4, a_offset5) : svdup_u16(0)); - c6 = (rest == 7 ? svld1_u16(pg16_first_4, a_offset6) : svdup_u16(0)); - c7 = (svdup_u16(0)); - - t0 = svzip1_u16(c0, c1); - t1 = svzip1_u16(c2, c3); - t2 = svzip1_u16(c4, c5); - t3 = svzip1_u16(c6, c7); - - m00 = svzip1_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); - m01 = svzip1_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); - - svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, - st_offsets_0, svreinterpret_u64_u32(m00)); - svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, - st_offsets_1, svreinterpret_u64_u32(m01)); - } - } - - if (n & 2) { - a_offset0 = a_offset; - a_offset1 = a_offset0 + lda; - a_offset2 = a_offset1 + lda; - a_offset3 = a_offset2 + lda; - a_offset4 = a_offset3 + lda; - a_offset5 = a_offset4 + lda; - a_offset6 = a_offset5 + lda; - a_offset7 = a_offset6 + lda; - a_offset += 2; - - b_offset0 = b_offset; - b_offset1 = b_offset0 + 8; - - b_offset += 2 * pad_m; - - for (BLASLONG i = 0; i < m / 8; i++) { - for (BLASLONG line = 0; line < 2; line++) { - b_offset0[line * 4] = a_offset0[line]; - b_offset0[line * 4 + 1] = a_offset1[line]; - b_offset0[line * 4 + 2] = a_offset2[line]; - b_offset0[line * 4 + 3] = a_offset3[line]; - - b_offset1[line * 4] = a_offset4[line]; - b_offset1[line * 4 + 1] = a_offset5[line]; - b_offset1[line * 4 + 2] = a_offset6[line]; - b_offset1[line * 4 + 3] = a_offset7[line]; - } - b_offset0 += 16; - b_offset1 += 16; - - a_offset0 += 8 * lda; - a_offset1 += 8 * lda; - a_offset2 += 8 * lda; - a_offset3 += 8 * lda; - a_offset4 += 8 * lda; - a_offset5 += 8 * lda; - a_offset6 += 8 * lda; - a_offset7 += 8 * lda; - } - - if (rest) { - for (BLASLONG line = 0; line < 2; line++) { - b_offset0[line * 4] = a_offset0[line]; - b_offset0[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line]; - b_offset0[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line]; - b_offset0[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line]; - - b_offset1[line * 4] = rest <= 4 ? 0 : a_offset4[line]; - b_offset1[line * 4 + 1] = rest <= 5 ? 0 : a_offset5[line]; - b_offset1[line * 4 + 2] = rest <= 6 ? 0 : a_offset6[line]; - b_offset1[line * 4 + 3] = 0; - } - } - } - - if (n & 1) { - a_offset0 = a_offset; - a_offset1 = a_offset0 + lda; - a_offset2 = a_offset1 + lda; - a_offset3 = a_offset2 + lda; - a_offset4 = a_offset3 + lda; - a_offset5 = a_offset4 + lda; - a_offset6 = a_offset5 + lda; - a_offset7 = a_offset6 + lda; - - for (BLASLONG i = 0; i < m / 8; i++) { - b_offset[0] = a_offset0[0]; - b_offset[1] = a_offset1[0]; - b_offset[2] = a_offset2[0]; - b_offset[3] = a_offset3[0]; - - b_offset[4] = 0; - b_offset[5] = 0; - b_offset[6] = 0; - b_offset[7] = 0; - - b_offset[8] = a_offset4[0]; - b_offset[9] = a_offset5[0]; - b_offset[10] = a_offset6[0]; - b_offset[11] = a_offset7[0]; - - b_offset[12] = 0; - b_offset[13] = 0; - b_offset[14] = 0; - b_offset[15] = 0; - - b_offset += 16; - a_offset0 += 8 * lda; - a_offset1 += 8 * lda; - a_offset2 += 8 * lda; - a_offset3 += 8 * lda; - a_offset4 += 8 * lda; - a_offset5 += 8 * lda; - a_offset6 += 8 * lda; - a_offset7 += 8 * lda; - } - - if (rest) { - b_offset[0] = *a_offset0; - b_offset[1] = rest == 1 ? 0 : *a_offset1; - b_offset[2] = rest <= 2 ? 0 : *a_offset2; - b_offset[3] = rest <= 3 ? 0 : *a_offset3; - - b_offset[4] = 0; - b_offset[5] = 0; - b_offset[6] = 0; - b_offset[7] = 0; - - b_offset[8] = rest <= 4 ? 0 : *a_offset4; - b_offset[9] = rest <= 5 ? 0 : *a_offset5; - b_offset[10] = rest <= 6 ? 0 : *a_offset6; - b_offset[11] = 0; - - b_offset[12] = 0; - b_offset[13] = 0; - b_offset[14] = 0; - b_offset[15] = 0; - } - } - - return 0; -} diff --git a/kernel/arm64/sbgemv_t_bfdot.c b/kernel/arm64/sbgemv_t_bfdot.c index 672f70acf2..4de245d3bc 100644 --- a/kernel/arm64/sbgemv_t_bfdot.c +++ b/kernel/arm64/sbgemv_t_bfdot.c @@ -33,16 +33,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" -int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, BLASLONG incx, float beta, float *y, BLASLONG incy) +#ifdef BGEMM +#define INNER_FLOAT bfloat16_t +#define TO32(x) vcvtah_f32_bf16(x) +#define FROM32(x) vcvth_bf16_f32(x) +#else +#define INNER_FLOAT float +#define TO32(x) x +#define FROM32(x) x +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, BLASLONG incx, FLOAT beta, FLOAT *y_in, BLASLONG incy) { if (m < 1 || n < 1) return(0); + BLASLONG i; BLASLONG ix,iy; BLASLONG j; bfloat16_t *a_ptr; bfloat16_t *x_ptr; - float *y_ptr; - float temp; + float temp, temp0, temp1, temp2, temp3; + +#ifdef BGEMM + bfloat16_t alpha_bf16, beta_bf16; + memcpy(&alpha_bf16, &alpha, sizeof(bfloat16_t)); + memcpy(&beta_bf16, &beta, sizeof(bfloat16_t)); + float alpha_f32 = vcvtah_f32_bf16(alpha_bf16); + float beta_f32 = vcvtah_f32_bf16(beta_bf16); +#else + float alpha_f32 = alpha; + float beta_f32 = beta; +#endif + INNER_FLOAT *y = (INNER_FLOAT *)y_in; + INNER_FLOAT *y_ptr; iy = 0; a_ptr = (bfloat16_t*)(a); @@ -56,10 +79,10 @@ int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat bfloat16_t *a2_ptr = a_ptr + lda * width * 2; bfloat16_t *a3_ptr = a_ptr + lda * width * 3; - float *y0_ptr = y + incy * width * 0; - float *y1_ptr = y + incy * width * 1; - float *y2_ptr = y + incy * width * 2; - float *y3_ptr = y + incy * width * 3; + INNER_FLOAT *y0_ptr = y + incy * width * 0; + INNER_FLOAT *y1_ptr = y + incy * width * 1; + INNER_FLOAT *y2_ptr = y + incy * width * 2; + INNER_FLOAT *y3_ptr = y + incy * width * 3; for (j = 0; j < width; j++) { float32x4_t temp0_vec = vdupq_n_f32(0.0f); @@ -113,26 +136,31 @@ int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat i += 4; } - if (beta == 0.0f) { - y0_ptr[iy] = alpha * vaddvq_f32(temp0_vec); - y1_ptr[iy] = alpha * vaddvq_f32(temp1_vec); - y2_ptr[iy] = alpha * vaddvq_f32(temp2_vec); - y3_ptr[iy] = alpha * vaddvq_f32(temp3_vec); - } - else { - y0_ptr[iy] = alpha * vaddvq_f32(temp0_vec) + beta * y0_ptr[iy]; - y1_ptr[iy] = alpha * vaddvq_f32(temp1_vec) + beta * y1_ptr[iy]; - y2_ptr[iy] = alpha * vaddvq_f32(temp2_vec) + beta * y2_ptr[iy]; - y3_ptr[iy] = alpha * vaddvq_f32(temp3_vec) + beta * y3_ptr[iy]; + + if (beta_f32 == 0.0f) { + temp0 = alpha_f32 * vaddvq_f32(temp0_vec); + temp1 = alpha_f32 * vaddvq_f32(temp1_vec); + temp2 = alpha_f32 * vaddvq_f32(temp2_vec); + temp3 = alpha_f32 * vaddvq_f32(temp3_vec); + } else { + temp0 = alpha_f32 * vaddvq_f32(temp0_vec) + beta_f32 * TO32(y0_ptr[iy]); + temp1 = alpha_f32 * vaddvq_f32(temp1_vec) + beta_f32 * TO32(y1_ptr[iy]); + temp2 = alpha_f32 * vaddvq_f32(temp2_vec) + beta_f32 * TO32(y2_ptr[iy]); + temp3 = alpha_f32 * vaddvq_f32(temp3_vec) + beta_f32 * TO32(y3_ptr[iy]); } for (; i < m; ++i) { - y0_ptr[iy] += alpha * vcvtah_f32_bf16(a0_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); - y1_ptr[iy] += alpha * vcvtah_f32_bf16(a1_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); - y2_ptr[iy] += alpha * vcvtah_f32_bf16(a2_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); - y3_ptr[iy] += alpha * vcvtah_f32_bf16(a3_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); + temp0 = temp0 + alpha_f32 * vcvtah_f32_bf16(a0_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); + temp1 = temp1 + alpha_f32 * vcvtah_f32_bf16(a1_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); + temp2 = temp2 + alpha_f32 * vcvtah_f32_bf16(a2_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); + temp3 = temp3 + alpha_f32 * vcvtah_f32_bf16(a3_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); } + y0_ptr[iy] = FROM32(temp0); + y1_ptr[iy] = FROM32(temp1); + y2_ptr[iy] = FROM32(temp2); + y3_ptr[iy] = FROM32(temp3); + iy += incy; a0_ptr += lda; @@ -164,16 +192,16 @@ int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat i += 4; } - if (beta == 0.0f) { - y_ptr[iy] = alpha * vaddvq_f32(temp0_vec); - } - else { - y_ptr[iy] = alpha * vaddvq_f32(temp0_vec) + beta * y_ptr[iy]; - } + if (beta_f32 == 0.0f) { + temp = alpha_f32 * vaddvq_f32(temp0_vec); + } else { + temp = alpha_f32 * vaddvq_f32(temp0_vec) + beta_f32 * TO32(y_ptr[iy]); + } for (; i < m; ++i) { - y_ptr[iy] += alpha * vcvtah_f32_bf16(a_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); + temp += alpha_f32 * vcvtah_f32_bf16(a_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); } + y_ptr[iy] = FROM32(temp); iy += incy; @@ -189,11 +217,11 @@ int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat temp += vcvtah_f32_bf16(a_ptr[i]) * vcvtah_f32_bf16(x_ptr[ix]); ix += incx; } - if (beta == 0.0f) { - y[iy] = alpha * temp; + if (beta_f32 == 0.0f) { + y[iy] = FROM32(alpha_f32 * temp); } else { - y[iy] = alpha * temp + beta * y[iy]; + y[iy] = FROM32(alpha_f32 * temp + beta_f32 * TO32(y[iy])); } iy += incy; a_ptr += lda; diff --git a/kernel/arm64/sgemm_direct_alpha_beta_arm64_sme1.c b/kernel/arm64/sgemm_direct_alpha_beta_arm64_sme1.c new file mode 100644 index 0000000000..f9017c7645 --- /dev/null +++ b/kernel/arm64/sgemm_direct_alpha_beta_arm64_sme1.c @@ -0,0 +1,216 @@ +/* + Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + SPDX-License-Identifier: BSD-3-Clause-Clear +*/ + +#include "common.h" +#include +#include +#include +#include "sme_abi.h" + +#if defined(DYNAMIC_ARCH) +#define COMBINE(a,b) a ## b +#define COMBINE2(a,b) COMBINE(a,b) +#define SME1_PREPROCESS_BASE sgemm_direct_sme1_preprocess +#define SME1_PREPROCESS COMBINE2(SME1_PREPROCESS_BASE,TS) +#define SME1_KERNEL2X2_BASE sgemm_direct_alpha_beta_sme1_2VLx2VL +#define SME1_KERNEL2X2 COMBINE2(SME1_KERNEL2X2_BASE,TS) +#else +#define SME1_PREPROCESS sgemm_direct_sme1_preprocess +#define SME1_KERNEL2X2 sgemm_direct_alpha_beta_sme1_2VLx2VL +#endif + +/* Function prototypes */ +extern void SME1_PREPROCESS(uint64_t nbr, uint64_t nbc,\ + const float * restrict a, float * a_mod); + +#if defined(HAVE_SME) + +#if defined(__ARM_FEATURE_SME) && defined(__clang__) && __clang_major__ >= 16 +#include +#endif + +/* Function Definitions */ +static uint64_t sve_cntw() { + uint64_t cnt; + asm volatile( + "rdsvl %[res], #1\n" + "lsr %[res], %[res], #2\n" + : [res] "=r" (cnt) :: + ); + return cnt; +} + +#if defined(__ARM_FEATURE_SME) && defined(__ARM_FEATURE_LOCALLY_STREAMING) && defined(__clang__) && __clang_major__ >= 16 +// Outer product kernel. +// Computes a 2SVL x 2SVL block of C, utilizing all four FP32 tiles of ZA. +__attribute__((always_inline)) inline void +kernel_2x2(const float *A, const float *B, float *C, size_t shared_dim, + size_t ldc, size_t block_rows, size_t block_cols, float alpha, float beta) + __arm_out("za") __arm_streaming { + + const uint64_t svl = svcntw(); + size_t ldb = ldc; + // Predicate set-up + svbool_t pg = svptrue_b32(); + svbool_t pg_a_0 = svwhilelt_b32_u64(0, block_rows); + svbool_t pg_a_1 = svwhilelt_b32_u64(svl, block_rows); + + svbool_t pg_b_0 = svwhilelt_b32_u64(0, block_cols); + svbool_t pg_b_1 = svwhilelt_b32_u64(svl, block_cols); + +#define pg_c_0 pg_b_0 +#define pg_c_1 pg_b_1 + + svzero_za(); + svfloat32_t beta_vec = svdup_f32(beta); + // Load C to ZA + for (size_t i = 0; i < MIN(svl, block_rows); i++) { + svfloat32_t row_c_0 = svld1(pg_c_0, &C[i * ldc]); + row_c_0 = svmul_x(pg, beta_vec, row_c_0); + svwrite_hor_za32_f32_m(/*tile*/0, /*slice*/i, pg_c_0, row_c_0); + + svfloat32_t row_c_1 = svld1(pg_c_1, &C[i * ldc + svl]); + row_c_1 = svmul_x(pg, beta_vec, row_c_1); + svwrite_hor_za32_f32_m(/*tile*/1, /*slice*/i, pg_c_1, row_c_1); + } + for (size_t i = svl; i < block_rows; i++) { + svfloat32_t row_c_0 = svld1(pg_c_0, &C[i * ldc]); + row_c_0 = svmul_x(pg, beta_vec, row_c_0); + svwrite_hor_za32_f32_m(/*tile*/2, /*slice*/i, pg_c_0, row_c_0); + + svfloat32_t row_c_1 = svld1(pg_c_1, &C[i * ldc + svl]); + row_c_1 = svmul_x(pg, beta_vec, row_c_1); + svwrite_hor_za32_f32_m(/*tile*/3, /*slice*/i, pg_c_1, row_c_1); + } + + svfloat32_t alpha_vec = svdup_f32(alpha); + // Iterate through shared dimension (K) + for (size_t k = 0; k < shared_dim; k++) { + // Load column of A + svfloat32_t col_a_0 = svld1(pg_a_0, &A[k * svl]); + col_a_0 = svmul_x(pg, alpha_vec, col_a_0); + svfloat32_t col_a_1 = svld1(pg_a_1, &A[(k + shared_dim) * svl]); + col_a_1 = svmul_x(pg, alpha_vec, col_a_1); + // Load row of B + svfloat32_t row_b_0 = svld1(pg_b_0, &B[k * ldb]); + svfloat32_t row_b_1 = svld1(pg_b_1, &B[k * ldb + svl]); + // Perform outer product + svmopa_za32_m(/*tile*/0, pg, pg, col_a_0, row_b_0); + svmopa_za32_m(/*tile*/1, pg, pg, col_a_0, row_b_1); + svmopa_za32_m(/*tile*/2, pg, pg, col_a_1, row_b_0); + svmopa_za32_m(/*tile*/3, pg, pg, col_a_1, row_b_1); + } + + // Store to C from ZA + for (size_t i = 0; i < MIN(svl, block_rows); i++) { + svst1_hor_za32(/*tile*/0, /*slice*/i, pg_c_0, &C[i * ldc]); + svst1_hor_za32(/*tile*/1, /*slice*/i, pg_c_1, &C[i * ldc + svl]); + } + for (size_t i = svl; i < block_rows; i++) { + svst1_hor_za32(/*tile*/2, /*slice*/i, pg_c_0, &C[i * ldc]); + svst1_hor_za32(/*tile*/3, /*slice*/i, pg_c_1, &C[i * ldc + svl]); + } +return; +} + +__arm_new("za") __arm_locally_streaming +void SME1_KERNEL2X2(uint64_t m, uint64_t k, uint64_t n, const float* alpha,\ + const float *ba, const float *restrict bb, const float* beta,\ + float *restrict C) { + + const uint64_t num_rows = m; + const uint64_t num_cols = n; + + const float *restrict a_ptr = ba; + const float *restrict b_ptr = bb; + float *restrict c_ptr = C; + + const uint64_t svl = svcntw(); + const uint64_t ldc = n; + + // Block over rows of C (panels of A) + uint64_t row_idx = 0; + + // 2x2 loop + uint64_t row_batch = 2*svl; + + // Block over row dimension of C + for (; row_idx < num_rows; row_idx += row_batch) { + row_batch = MIN(row_batch, num_rows - row_idx); + + uint64_t col_idx = 0; + uint64_t col_batch = 2*svl; + + // Block over column dimension of C + for (; col_idx < num_cols; col_idx += col_batch) { + col_batch = MIN(col_batch, num_cols - col_idx); + + kernel_2x2(&a_ptr[row_idx * k], &b_ptr[col_idx], + &c_ptr[row_idx * ldc + col_idx], k, + ldc, row_batch, col_batch, *alpha, *beta); + } + } + return; +} + +#else +void SME1_KERNEL2X2(uint64_t m, uint64_t k, uint64_t n, const float* alpha,\ + const float *ba, const float *restrict bb, const float* beta,\ + float *restrict C){fprintf(stderr,"empty sgemm_alpha_beta2x2 should never get called!!!\n");} +#endif + +/*void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K,\ + float * __restrict A, BLASLONG strideA, float * __restrict B,\ + BLASLONG strideB , float * __restrict R, BLASLONG strideR) +*/ +void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float alpha, float * __restrict A,\ + BLASLONG strideA, float * __restrict B, BLASLONG strideB ,\ + float beta, float * __restrict R, BLASLONG strideR){ + + uint64_t m_mod, vl_elms; + + vl_elms = sve_cntw(); + + m_mod = ceil((double)M/(double)vl_elms) * vl_elms; + + float *A_mod = (float *) malloc(m_mod*K*sizeof(float)); + + /* Prevent compiler optimization by reading from memory instead + * of reading directly from vector (z) registers. + * */ + asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", + "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", + "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", + "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); + + /* Pre-process the left matrix to make it suitable for + matrix sum of outer-product calculation + */ + + SME1_PREPROCESS(M, K, A, A_mod); + + asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", + "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15","d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", + "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", + "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); + + /* Calculate C = alpha*A*B + beta*C */ + + SME1_KERNEL2X2(M, K, N, &alpha, A_mod, B, &beta, R); + + free(A_mod); +} + +#else + +void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float alpha, float * __restrict A,\ + BLASLONG strideA, float * __restrict B, BLASLONG strideB ,\ + float beta, float * __restrict R, BLASLONG strideR){fprintf(stderr,"empty sgemm_direct_alpha_beta should not be called!!!\n");} +#endif + diff --git a/kernel/arm64/sgemm_direct_arm64_sme1.c b/kernel/arm64/sgemm_direct_arm64_sme1.c index 13c337a13e..b9e79f1977 100644 --- a/kernel/arm64/sgemm_direct_arm64_sme1.c +++ b/kernel/arm64/sgemm_direct_arm64_sme1.c @@ -7,18 +7,29 @@ #include #include #include +#if defined(DYNAMIC_ARCH) +#define COMBINE(a,b) a ## b +#define COMBINE2(a,b) COMBINE(a,b) +#define SME1_PREPROCESS_BASE sgemm_direct_sme1_preprocess +#define SME1_PREPROCESS COMBINE2(SME1_PREPROCESS_BASE,TS) +#define SME1_DIRECT2X2_BASE sgemm_direct_sme1_2VLx2VL +#define SME1_DIRECT2X2 COMBINE2(SME1_DIRECT2X2_BASE,TS) +#else +#define SME1_PREPROCESS sgemm_direct_sme1_preprocess +#define SME1_DIRECT2X2 sgemm_direct_sme1_2VLx2VL +#endif #if defined(HAVE_SME) - /* Function prototypes */ -extern void sgemm_direct_sme1_preprocess(uint64_t nbr, uint64_t nbc,\ - const float * restrict a, float * a_mod) __asm__("sgemm_direct_sme1_preprocess"); -extern void sgemm_direct_sme1_2VLx2VL(uint64_t m, uint64_t k, uint64_t n,\ +extern void SME1_PREPROCESS(uint64_t nbr, uint64_t nbc,\ + const float * restrict a, float * a_mod) ; + +extern void SME1_DIRECT2X2(uint64_t m, uint64_t k, uint64_t n,\ const float * matLeft,\ const float * restrict matRight,\ - const float * restrict matResult) __asm__("sgemm_direct_sme1_2VLx2VL"); + const float * restrict matResult) ; /* Function Definitions */ -uint64_t sve_cntw() { +static uint64_t sve_cntw() { uint64_t cnt; asm volatile( "rdsvl %[res], #1\n" @@ -39,7 +50,6 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ uint64_t m_mod, vl_elms; vl_elms = sve_cntw(); - m_mod = ceil((double)M/(double)vl_elms) * vl_elms; float *A_mod = (float *) malloc(m_mod*K*sizeof(float)); @@ -48,7 +58,7 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ * of reading directly from vector (z) registers. * */ asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", - "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", + "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", @@ -57,13 +67,13 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ /* Pre-process the left matrix to make it suitable for matrix sum of outer-product calculation */ - sgemm_direct_sme1_preprocess(M, K, A, A_mod); + SME1_PREPROCESS(M, K, A, A_mod); /* Calculate C = A*B */ - sgemm_direct_sme1_2VLx2VL(M, K, N, A_mod, B, R); + SME1_DIRECT2X2(M, K, N, A_mod, B, R); asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", - "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", + "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", @@ -75,6 +85,16 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ BLASLONG strideA, float * __restrict B, BLASLONG strideB ,\ - float * __restrict R, BLASLONG strideR){} - + float * __restrict R, BLASLONG strideR){ +fprintf(stderr,"EMPTY sgemm_kernel_direct should never be called \n"); +} +void SME1_DIRECT2X2( uint64_t M , uint64_t K, uint64_t N,\ + const float * restrict A_base,\ + const float * restrict B_base,\ + const float * restrict C_base){}; +void SME1_PREPROCESS(uint64_t nbr, uint64_t nbc,\ + const float * restrict a, float * a_mod){}; + + #endif + diff --git a/kernel/arm64/sgemm_direct_performant.c b/kernel/arm64/sgemm_direct_performant.c new file mode 100644 index 0000000000..a20670e8b7 --- /dev/null +++ b/kernel/arm64/sgemm_direct_performant.c @@ -0,0 +1,15 @@ +#include "common.h" +/* helper for the direct sgemm code adapted from Arjan van der Ven's x86_64 version */ + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K) +{ +if (M<3) return 0; + unsigned long long mnk = M * N * K; + /* benchmark performance on M4 peaks around 512 and crosses the graph of the NEON SGEMM at about 3100 */ + if (mnk >= 3100L * 3100L * 3100L) + return 0; + + return 1; +} + + diff --git a/kernel/arm64/sgemm_direct_sme1.S b/kernel/arm64/sgemm_direct_sme1_2VLx2VL.S similarity index 95% rename from kernel/arm64/sgemm_direct_sme1.S rename to kernel/arm64/sgemm_direct_sme1_2VLx2VL.S index 8c0a173f3d..afb662c1fb 100644 --- a/kernel/arm64/sgemm_direct_sme1.S +++ b/kernel/arm64/sgemm_direct_sme1_2VLx2VL.S @@ -35,16 +35,17 @@ #define K_exit x15 //Exit condition for K loop #define M_cntr x16 //M loop counter #define C1 x17 //Constant1: N*(SVLs+1);SVLs-No. of 32-bit elements -#define C2 x18 //Constant2: N + SVLs -#define C3 x19 //Constant3: K*SVLs + SVLs -#define C4 x20 //Constant4: SVLs-2 -#define C5 x21 //Constant5: K*SVLs -#define C6 x22 //Constant6: N*SVLs +#define C2 x19 //Constant2: N + SVLs +#define C3 x20 //Constant3: K*SVLs + SVLs +#define C4 x21 //Constant4: SVLs-2 +#define C5 x22 //Constant5: K*SVLs +#define C6 x23 //Constant6: N*SVLs .text - .global sgemm_direct_sme1_2VLx2VL + .global ASMNAME - sgemm_direct_sme1_2VLx2VL: + ASMNAME: + //sgemm_direct_sme1_2VLx2VL: stp x19, x20, [sp, #-48]! stp x21, x22, [sp, #16] @@ -61,7 +62,7 @@ add C2, N, C4 //N + SVLs add C3, C5, C4 //K*SVLs + SVLs whilelt p2.s, M_cntr, M //Tile 0,1 predicate (M dimension) - sub w20, w20, #2 //SVLs-2 + sub w21, w21, #2 //SVLs-2 .M_Loop: incw M_cntr @@ -198,7 +199,7 @@ process_K_less_than_equal_2: st1w {za1h.s[w13, #0]}, p5, [Cptr1] st1w {za2h.s[w13, #0]}, p6, [Cptr0, C6, lsl #2] st1w {za3h.s[w13, #0]}, p7, [Cptr1, C6, lsl #2] - cmp w13, w20 + cmp w13, w21 b.mi .Loop_store_ZA psel p4, p0, p2.s[w13, 1] psel p5, p1, p2.s[w13, 1] @@ -211,12 +212,12 @@ process_K_less_than_equal_2: addvl Cptr, Cptr, #2 addvl Bptr, Bptr, #1 whilelt p0.b, Bptr, N_exit //1st Tile predicate (N dimension) - b.first .N_Loop + b.mi .N_Loop add A_base, A_base, C5, lsl #3 //A_base += 2*K*SVLs FP32 elements add C_base, C_base, C6, lsl #3 //C_base += 2*N*SVLs FP32 elements incw M_cntr whilelt p2.s, M_cntr, M //1st Tile predicate (M dimension) - b.first .M_Loop + b.mi .M_Loop smstop diff --git a/kernel/arm64/sgemm_direct_sme1_preprocess.S b/kernel/arm64/sgemm_direct_sme1_preprocess.S index fa13620751..6c51b0bf63 100644 --- a/kernel/arm64/sgemm_direct_sme1_preprocess.S +++ b/kernel/arm64/sgemm_direct_sme1_preprocess.S @@ -37,9 +37,9 @@ #define C6 x15 //Constant6: 3*ncol .text - .global sgemm_direct_sme1_preprocess + .global ASMNAME //sgemm_direct_sme1_preprocess - sgemm_direct_sme1_preprocess: + ASMNAME: //sgemm_direct_sme1_preprocess: stp x19, x20, [sp, #-48]! stp x21, x22, [sp, #16] @@ -114,14 +114,14 @@ addvl mat_ptr0, mat_ptr0, #1 //mat_ptr0 += SVLb whilelt p8.b, mat_ptr0, inner_loop_exit - b.first .Loop_process + b.mi .Loop_process add mat_mod, mat_mod, C3, lsl #2 //mat_mod+=SVLs*nbc FP32 elements add mat, mat, C3, lsl #2 //mat+=SVLs*nbc FP32 elements incw outer_loop_cntr whilelt p0.s, outer_loop_cntr, nrow - b.first .M_Loop + b.mi .M_Loop smstop diff --git a/kernel/arm64/sme_abi.h b/kernel/arm64/sme_abi.h new file mode 100644 index 0000000000..07bba4895b --- /dev/null +++ b/kernel/arm64/sme_abi.h @@ -0,0 +1,46 @@ +/*************************************************************************** + * Copyright (c) 2024, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF + * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#pragma once + +#include + +/** + * * These are SME ABI routines for saving & restoring SME state. + * * They are typically provided by a compiler runtime library such + * * as libgcc or compiler-rt, but support for these routines is not + * * yet available on all platforms. + * * + * * Define these as aborting stubs so that we loudly fail on nested + * * usage of SME state. + * * + * * These are defined as weak symbols so that a compiler runtime can + * * override them if supported. + * */ +__attribute__((weak)) void __arm_tpidr2_save() { abort(); } +__attribute__((weak)) void __arm_tpidr2_restore() { abort(); } + diff --git a/kernel/arm64/ssymm_direct_alpha_beta_arm64_sme1.c b/kernel/arm64/ssymm_direct_alpha_beta_arm64_sme1.c new file mode 100644 index 0000000000..eeabab4c23 --- /dev/null +++ b/kernel/arm64/ssymm_direct_alpha_beta_arm64_sme1.c @@ -0,0 +1,241 @@ +/* + Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + SPDX-License-Identifier: BSD-3-Clause-Clear +*/ + +#include "common.h" +#include +#include +#include +// #include "sme_abi.h" +#if defined(HAVE_SME) + +#if defined(__ARM_FEATURE_SME) && defined(__clang__) && __clang_major__ >= 16 +#include +#endif + +#if defined(DYNAMIC_ARCH) +#define COMBINE(a,b) a ## b +#define COMBINE2(a,b) COMBINE(a,b) +#define SGEMM_PREPROCESS_BASE sgemm_direct_sme1_preprocess +#define SGEMM_PREPROCESS COMBINE2(SGEMM_PREPROCESS_BASE,TS) +#define SGEMM_DIRECT2X2_BASE sgemm_direct_alpha_beta_sme1_2VLx2VL +#define SGEMM_DIRECT2X2 COMBINE2(SGEMM_DIRECT2X2_BASE,TS) +#else +#define SGEMM_PREPROCESS sgemm_direct_sme1_preprocess +#define SGEMM_DIRECT2X2 sgemm_direct_alpha_beta_sme1_2VLx2VL +#endif + +/* Function prototypes */ +extern void SGEMM_PREPROCESS(uint64_t nbr, uint64_t nbc,\ + const float * restrict a, float * a_mod); + +extern void SGEMM_DIRECT2X2(uint64_t m, uint64_t k, uint64_t n, const float* alpha,\ + const float *ba, const float *restrict bb, const float* beta,\ + float *restrict C); +/* Function Definitions */ +static uint64_t sve_cntw() { + uint64_t cnt; + asm volatile( + "rdsvl %[res], #1\n" + "lsr %[res], %[res], #2\n" + : [res] "=r" (cnt) :: + ); + return cnt; +} + +#if defined(__ARM_FEATURE_SME) && defined(__ARM_FEATURE_LOCALLY_STREAMING) && defined(__clang__) && __clang_major__ >= 16 + +__arm_new("za") __arm_locally_streaming +static void ssymm_direct_sme1_preprocessLU(uint64_t nbr, uint64_t nbc, + const float *restrict a, float *restrict a_mod) +{ + // const uint64_t num_rows = nbr; + // const uint64_t num_cols = nbc; + const uint64_t svl = svcntw(); + uint64_t row_batch = svl; + + float *restrict pSrc; + float *restrict pDst; + for (uint64_t row_idx = 0; row_idx < nbr; row_idx += row_batch) + { + row_batch = MIN(row_batch, nbr - row_idx); + + // Fill in the lower triangle and Transpose 1SVL x N panel of A + uint64_t col_batch = svl; + + for (uint64_t col_idx = 0; col_idx < nbc; col_idx += col_batch) + { + svzero_za(); + + if (col_idx == row_idx) + { + pSrc = &a[(row_idx)*nbc + col_idx]; + pDst = &a_mod[(col_idx)*svl + row_idx * nbc]; + // Load horizontal slices, filling lower elements + const svbool_t pg_row = svwhilelt_b32_u64(col_idx, nbc); + for (int64_t row = row_batch - 1; row >= 0; row--) + { + svld1_hor_za32(0, row, pg_row, &pSrc[row * nbc]); + svld1_ver_za32(0, row, pg_row, &pSrc[row * nbc]); + } + // Save vertical slices + col_batch = MIN(col_batch, nbc - col_idx); + for (uint64_t col = 0; col < col_batch; col++) + { + svst1_ver_za32(0, col, svptrue_b32(), &pDst[col * svl]); + } + } + else if (col_idx > row_idx) + { + pSrc = &a[(row_idx)*nbc + col_idx]; + pDst = &a_mod[(col_idx)*svl + row_idx * nbc]; + // Load horizontal slices + const svbool_t pg_row = svwhilelt_b32_u64(col_idx, nbc); + for (uint64_t row = 0; row < row_batch; row++) + { + svld1_hor_za32(0, row, pg_row, &pSrc[row * nbc]); + } + // Save vertical slices + col_batch = MIN(col_batch, nbc - col_idx); + for (uint64_t col = 0; col < col_batch; col++) + { + svst1_ver_za32(0, col, svptrue_b32(), &pDst[col * svl]); + } + } + else if (col_idx < row_idx) + { + pSrc = &a[row_idx + col_idx * nbc]; + pDst = &a_mod[(col_idx)*svl + row_idx * nbc]; + // Load horizontal slices + const svbool_t pg_row = svwhilelt_b32_u64(row_idx, nbc); + for (uint64_t row = 0; row < svl; row++) + { + svld1_hor_za32(0, row, pg_row, &pSrc[row * nbc]); + } + // Save vertical slices + col_batch = MIN(col_batch, nbc - col_idx); + for (uint64_t col = 0; col < svl; col++) + { + svst1_hor_za32(0, col, svptrue_b32(), &pDst[col * svl]); + } + } + } + } +} + +// +__arm_new("za") __arm_locally_streaming +static void ssymm_direct_sme1_preprocessLL(uint64_t nbr, uint64_t nbc, + const float *restrict a, float *restrict a_mod) +{ + // const uint64_t num_rows = nbr; + const uint64_t svl = svcntw(); + uint64_t row_batch = svl; + + float *restrict pSrc; + float *restrict pDst; + for (uint64_t row_idx = 0; row_idx < nbr; row_idx += row_batch) + { + row_batch = MIN(row_batch, nbr - row_idx); + + // Fill in the upper triangle and Transpose 1SVL x N panel of A + uint64_t col_batch = svl; + + for (uint64_t col_idx = 0; col_idx < nbc; col_idx += col_batch) + { + svzero_za(); + + if (col_idx == row_idx) + { + pSrc = &a[(row_idx)*nbc + col_idx]; + pDst = &a_mod[(col_idx)*svl + row_idx * nbc]; + // Load horizontal slices, filling upper elements + const svbool_t pg_row = svwhilelt_b32_u64(col_idx, nbc); + for (uint64_t row = 0; row < row_batch; row++) + { + svld1_hor_za32(0, row, pg_row, &pSrc[row * nbc]); + svld1_ver_za32(0, row, pg_row, &pSrc[row * nbc]); + } + // Save vertical slices + col_batch = MIN(col_batch, nbc - col_idx); + for (uint64_t col = 0; col < col_batch; col++) + { + svst1_ver_za32(0, col, svptrue_b32(), &pDst[col * svl]); + } + } + else if (col_idx > row_idx) + { + pSrc = &a[row_idx + col_idx * nbc]; + pDst = &a_mod[(col_idx)*svl + row_idx * nbc]; + // Load horizontal slices + const svbool_t pg_row = svptrue_b32(); + for (uint64_t row = 0; row < row_batch; row++) + { + svld1_hor_za32(0, row, pg_row, &pSrc[row * nbc]); + } + // Save vertical slices + col_batch = MIN(col_batch, nbc - col_idx); + for (uint64_t col = 0; col < col_batch; col++) + { + svst1_hor_za32(0, col, svptrue_b32(), &pDst[col * svl]); + } + } + else if (col_idx < row_idx) + { + pSrc = &a[(row_idx)*nbc + col_idx]; + pDst = &a_mod[(col_idx)*svl + row_idx * nbc]; + // Load horizontal slices + const svbool_t pg_row = svwhilelt_b32_u64(col_idx, nbc); + for (uint64_t row = 0; row < row_batch; row++) + { + svld1_hor_za32(0, row, pg_row, &pSrc[row * nbc]); + } + // Save vertical slices + col_batch = MIN(col_batch, nbc - col_idx); + for (uint64_t col = 0; col < col_batch; col++) + { + svst1_ver_za32(0, col, svptrue_b32(), &pDst[col * svl]); + } + } + } + } +} +#else +static void ssymm_direct_sme1_preprocessLU(uint64_t nbr, uint64_t nbc, + const float *restrict a, float *restrict a_mod){} +static void ssymm_direct_sme1_preprocessLL(uint64_t nbr, uint64_t nbc, + const float *restrict a, float *restrict a_mod){} +#endif + +// +void CNAME(BLASLONG M, BLASLONG N, float alpha, float *__restrict A, + BLASLONG strideA, float *__restrict B, BLASLONG strideB, + float beta, float *__restrict R, BLASLONG strideR) +{ + uint64_t vl_elms = sve_cntw(); // vl_elem = 16 + uint64_t m_mod = ceil((double)M / (double)vl_elms) * vl_elms; + + /* Pre-process the left matrix to make it suitable for + matrix sum of outer-product calculation + */ + float *A_mod = (float *)malloc(m_mod * M * sizeof(float)); + +#if defined(UPPER) + ssymm_direct_sme1_preprocessLU(M, M, A, A_mod); +#elif defined(LOWER) + ssymm_direct_sme1_preprocessLL(M, M, A, A_mod); +#endif + + /* Calculate C = alpha*A*B + beta*C */ + SGEMM_DIRECT2X2(M, M, N, &alpha, A_mod, B, &beta, R); + free(A_mod); +} + +#else + +void CNAME (BLASLONG M, BLASLONG N, float alpha, float * __restrict A,\ + BLASLONG strideA, float * __restrict B, BLASLONG strideB ,\ + float beta, float * __restrict R, BLASLONG strideR){} + +#endif diff --git a/kernel/arm64/ssyr2k_direct_alpha_beta_arm64_sme1.c b/kernel/arm64/ssyr2k_direct_alpha_beta_arm64_sme1.c new file mode 100755 index 0000000000..a8c36271ad --- /dev/null +++ b/kernel/arm64/ssyr2k_direct_alpha_beta_arm64_sme1.c @@ -0,0 +1,298 @@ +/* + Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + SPDX-License-Identifier: BSD-3-Clause-Clear +*/ + +#include "common.h" +#include +#include +#include + +#if defined(DYNAMIC_ARCH) +#define COMBINE(a,b) a ## b +#define COMBINE2(a,b) COMBINE(a,b) +#define SGEMM_PREPROCESS_BASE sgemm_direct_sme1_preprocess +#define SGEMM_PREPROCESS COMBINE2(SGEMM_PREPROCESS_BASE,TS) +#define SGEMM_DIRECT2X2_BASE sgemm_direct_alpha_beta_sme1_2VLx2VL +#define SGEMM_DIRECT2X2 COMBINE2(SGEMM_DIRECT2X2_BASE,TS) +#else +#define SGEMM_PREPROCESS sgemm_direct_sme1_preprocess +#define SGEMM_DIRECT2X2 sgemm_direct_alpha_beta_sme1_2VLx2VL +#endif +#if defined(HAVE_SME) + +#if defined(__ARM_FEATURE_SME) && defined(__clang__) && __clang_major__ >= 16 +#include +#endif + +/* Function prototypes */ +extern void SGEMM_PREPROCESS(uint64_t nbr, uint64_t nbc,\ + const float * restrict a, float * a_mod) ; + +/* Function Definitions */ +static uint64_t sve_cntw() { + uint64_t cnt; + asm volatile( + "rdsvl %[res], #1\n" + "lsr %[res], %[res], #2\n" + : [res] "=r" (cnt) :: + ); + return cnt; +} + +#if defined(__ARM_FEATURE_SME) && defined(__ARM_FEATURE_LOCALLY_STREAMING) && defined(__clang__) && __clang_major__ >= 16 +// Outer product kernel. +// Computes a 2SVL x 2SVL block of C, utilizing all four FP32 tiles of ZA. +__attribute__((always_inline)) inline void +kernel_2x2(const float *A, float *B_T, const float *B, float *A_T, float *C, size_t shared_dim, + size_t ldc, size_t block_rows, size_t block_cols, float alpha, + float beta, uint64_t row_idx, uint64_t col_idx) + __arm_out("za") __arm_streaming { + + const uint64_t svl = svcntw(); + size_t ldb = ldc; + // Predicate set-up + svbool_t pg = svptrue_b32(); + svbool_t pg_a_0 = svwhilelt_b32_u64(0, block_rows); + svbool_t pg_a_1 = svwhilelt_b32_u64(svl, block_rows); + + svbool_t pg_b_0 = svwhilelt_b32_u64(0, block_cols); + svbool_t pg_b_1 = svwhilelt_b32_u64(svl, block_cols); + +#define pg_c_0 pg_b_0 +#define pg_c_1 pg_b_1 + + svzero_za(); + svfloat32_t beta_vec = svdup_f32(beta); + + // Load C to ZA + for (size_t i = 0; i < MIN(svl, block_rows); i++) { + svfloat32_t row_c_0 = svld1(pg_c_0, &C[i * ldc]); + row_c_0 = svmul_x(pg, beta_vec, row_c_0); + svwrite_hor_za32_f32_m(/*tile*/0, /*slice*/i, pg_c_0, row_c_0); + + svfloat32_t row_c_1 = svld1(pg_c_1, &C[i * ldc + svl]); + row_c_1 = svmul_x(pg, beta_vec, row_c_1); + svwrite_hor_za32_f32_m(/*tile*/1, /*slice*/i, pg_c_1, row_c_1); + } + for (size_t i = svl; i < block_rows; i++) { + svfloat32_t row_c_0 = svld1(pg_c_0, &C[i * ldc]); + row_c_0 = svmul_x(pg, beta_vec, row_c_0); + svwrite_hor_za32_f32_m(/*tile*/2, /*slice*/i, pg_c_0, row_c_0); + + svfloat32_t row_c_1 = svld1(pg_c_1, &C[i * ldc + svl]); + row_c_1 = svmul_x(pg, beta_vec, row_c_1); + svwrite_hor_za32_f32_m(/*tile*/3, /*slice*/i, pg_c_1, row_c_1); + } + + svfloat32_t alpha_vec = svdup_f32(alpha); + // Iterate through shared dimension (K) + for (size_t k = 0; k < shared_dim; k++) { +#if !defined(TRANSA) + // Computes alpha*A*B**T + // Load column of A + svfloat32_t col_a_0 = svld1(pg_a_0, &A[k * svl]); + col_a_0 = svmul_x(pg, alpha_vec, col_a_0); + svfloat32_t col_a_1 = svld1(pg_a_1, &A[(k + shared_dim) * svl]); + col_a_1 = svmul_x(pg, alpha_vec, col_a_1); + + // Load row of B**T + svfloat32_t row_b_0 = svld1(pg_b_0, &B_T[k * svl]); + svfloat32_t row_b_1 = svld1(pg_b_1, &B_T[(k + shared_dim) * svl]); +#else + // Computes alpha*A**T*B + // Load column of A**T + svfloat32_t col_a_0 = svld1(pg_a_0, &A[k * ldb]); + col_a_0 = svmul_x(pg, alpha_vec, col_a_0); + + svfloat32_t col_a_1 = svld1(pg_a_1, &A[k * ldb + svl]); + col_a_1 = svmul_x(pg, alpha_vec, col_a_1); + + // Load row of B + svfloat32_t row_b_0 = svld1(pg_b_0, &B_T[k * ldb]); + svfloat32_t row_b_1 = svld1(pg_b_1, &B_T[k * ldb + svl]); +#endif + // Perform outer product + svmopa_za32_m(/*tile*/0, pg, pg, col_a_0, row_b_0); + svmopa_za32_m(/*tile*/1, pg, pg, col_a_0, row_b_1); + svmopa_za32_m(/*tile*/2, pg, pg, col_a_1, row_b_0); + svmopa_za32_m(/*tile*/3, pg, pg, col_a_1, row_b_1); + +#if !defined(TRANSA) + // Computes alpha*B*A**T + // Load column of B + col_a_0 = svld1(pg_a_0, &B[k * svl]); + col_a_0 = svmul_x(pg, alpha_vec, col_a_0); + col_a_1 = svld1(pg_a_1, &B[(k + shared_dim) * svl]); + col_a_1 = svmul_x(pg, alpha_vec, col_a_1); + + // Load row of A**T + row_b_0 = svld1(pg_b_0, &A_T[k * svl]); + row_b_1 = svld1(pg_b_1, &A_T[(k + shared_dim) * svl]); +#else + // Computes alpha*B**T*A + // Load column of B**T + col_a_0 = svld1(pg_a_0, &B[k * ldb]); + col_a_0 = svmul_x(pg, alpha_vec, col_a_0); + + col_a_1 = svld1(pg_a_1, &B[k * ldb + svl]); + col_a_1 = svmul_x(pg, alpha_vec, col_a_1); + + // Load row of A + row_b_0 = svld1(pg_b_0, &A_T[k * ldb]); + row_b_1 = svld1(pg_b_1, &A_T[k * ldb + svl]); +#endif + // Perform outer product + svmopa_za32_m(/*tile*/0, pg, pg, col_a_0, row_b_0); + svmopa_za32_m(/*tile*/1, pg, pg, col_a_0, row_b_1); + svmopa_za32_m(/*tile*/2, pg, pg, col_a_1, row_b_0); + svmopa_za32_m(/*tile*/3, pg, pg, col_a_1, row_b_1); + } + +#if defined(UPPER) +#define pg_c_0_full pg_c_0 +#define pg_c_1_full pg_c_1 + + bool need_update_pg_b = true; + size_t last_invalid_index = col_idx - row_idx; + // For Upper, If col_idx - row_idx >= 2*svl, we don't need to update the predicate due to all elements above the digonal + if (col_idx - row_idx >= 2*svl) { + need_update_pg_b = false; + } + // Store to C from ZA + for (size_t i = 0; i < MIN(svl, block_rows); i++, last_invalid_index++) { + if (need_update_pg_b) { + pg_c_0 = svnot_b_z(pg_c_0_full, svwhilelt_b32_u64(0, last_invalid_index)); + pg_c_1 = svnot_b_z(pg_c_1_full, svwhilelt_b32_u64(svl, last_invalid_index)); + } + svst1_hor_za32(/*tile*/0, /*slice*/i, pg_c_0, &C[i * ldc]); + svst1_hor_za32(/*tile*/1, /*slice*/i, pg_c_1, &C[i * ldc + svl]); + } + for (size_t i = svl; i < block_rows; i++,last_invalid_index++) { + if (need_update_pg_b) { + pg_c_0 = svnot_b_z(pg_c_0_full, svwhilelt_b32_u64(0, last_invalid_index)); + pg_c_1 = svnot_b_z(pg_c_1_full, svwhilelt_b32_u64(svl, last_invalid_index)); + } + svst1_hor_za32(/*tile*/2, /*slice*/i, pg_c_0, &C[i * ldc]); + svst1_hor_za32(/*tile*/3, /*slice*/i, pg_c_1, &C[i * ldc + svl]); + } +#else + // Store to C from ZA + size_t valid_index = row_idx - col_idx + 1; + for (size_t i = 0; i < MIN(svl, block_rows); i++, valid_index++) { + pg_c_0 = svwhilelt_b32_u64(0, MIN(valid_index, block_cols)); + pg_c_1 = svwhilelt_b32_u64(svl, MIN(valid_index, block_cols)); + svst1_hor_za32(/*tile*/0, /*slice*/i, pg_c_0, &C[i * ldc]); + svst1_hor_za32(/*tile*/1, /*slice*/i, pg_c_1, &C[i * ldc + svl]); + } + for (size_t i = svl; i < block_rows; i++, valid_index++) { + pg_c_0 = svwhilelt_b32_u64(0, MIN(valid_index, block_cols)); + pg_c_1 = svwhilelt_b32_u64(svl, MIN(valid_index, block_cols)); + svst1_hor_za32(/*tile*/2, /*slice*/i, pg_c_0, &C[i * ldc]); + svst1_hor_za32(/*tile*/3, /*slice*/i, pg_c_1, &C[i * ldc + svl]); + } +#endif +} + +__arm_new("za") __arm_locally_streaming +static void ssyr2k_direct_sme1_2VLx2VL(uint64_t n, uint64_t k, const float* alpha,\ + const float *ba, const float *bb, const float* beta, float *restrict bc) { + const uint64_t num_rows = n; + const uint64_t num_cols = n; + + const float *restrict a_ptr = ba; + const float *restrict b_ptr = bb; + float *restrict c_ptr = bc; + + const uint64_t svl = svcntw(); + const uint64_t ldc = n; + + // Block over rows of C (panels of A) + uint64_t row_idx = 0; + + // 2x2 loop + uint64_t row_batch = 2*svl; + + // Block over row dimension of C + for (; row_idx < num_rows; row_idx += row_batch) { + row_batch = MIN(row_batch, num_rows - row_idx); + uint64_t col_batch = 2*svl; +#if defined(UPPER) + // for UPLO is upper, Start from column col_idx = rows_index to ensure we only process the upper triangle (col_idx >= rows_index) + for (uint64_t col_idx = row_idx; col_idx < num_cols; col_idx += col_batch) { + col_batch = MIN(col_batch, num_cols - col_idx); +#else + // for UPLO is lower, we only process the lower triangle part (col_idx <= row_idxx) + for (uint64_t col_idx = 0; col_idx < num_cols && col_idx <= row_idx; col_idx += col_batch) { +#endif + col_batch = MIN(col_batch, num_cols - col_idx); +#if !defined(TRANSA) + kernel_2x2(&a_ptr[row_idx * k], &b_ptr[col_idx * k], &b_ptr[row_idx * k], &a_ptr[col_idx * k], + &c_ptr[row_idx * ldc + col_idx], k, + ldc, row_batch, col_batch, *alpha, *beta, row_idx, col_idx); +#else + kernel_2x2(&a_ptr[row_idx], &b_ptr[col_idx], &b_ptr[row_idx], &a_ptr[col_idx], + &c_ptr[row_idx * ldc + col_idx], k, + ldc, row_batch, col_batch, *alpha, *beta, row_idx, col_idx); +#endif + + } + } + return; +} + +#else +static void ssyr2k_direct_sme1_2VLx2VL(uint64_t n, uint64_t k, const float* alpha,\ + const float *ba, const float *bb, const float* beta, float *restrict bc){} +#endif + +void CNAME (BLASLONG N, BLASLONG K, float alpha, float * __restrict A, \ + BLASLONG strideA, float * __restrict B, BLASLONG strideB, \ + float beta, float * __restrict R, BLASLONG strideR) +{ +#if !defined(TRANSA) + uint64_t n_mod, vl_elms; + + vl_elms = sve_cntw(); + + n_mod = ceil((double)N/(double)vl_elms) * vl_elms; + + float *A_mod = (float *) malloc(n_mod*K*sizeof(float)); + float *B_mod = (float *) malloc(n_mod*K*sizeof(float)); + + /* Prevent compiler optimization by reading from memory instead + * of reading directly from vector (z) registers. + * */ + asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", + "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", + "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", + "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); + + /* Pre-process the left matrix to make it suitable for + matrix sum of outer-product calculation + */ + SGEMM_PREPROCESS(N, K, A, A_mod); + SGEMM_PREPROCESS(N, K, B, B_mod); + asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", + "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", + "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", + "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); + + ssyr2k_direct_sme1_2VLx2VL(N, K, &alpha, A_mod, B_mod, &beta, R); + free(A_mod); + free(B_mod); +#else + ssyr2k_direct_sme1_2VLx2VL(N, K, &alpha, A, B, &beta, R); +#endif +} +#else + +void CNAME (BLASLONG N, BLASLONG K, float alpha, float * __restrict A, \ + BLASLONG strideA, float * __restrict B, BLASLONG strideB, \ + float beta, float * __restrict C, BLASLONG strideC){} + +#endif diff --git a/kernel/arm64/ssyrk_direct_alpha_beta_arm64_sme1.c b/kernel/arm64/ssyrk_direct_alpha_beta_arm64_sme1.c new file mode 100644 index 0000000000..d1e6bcc92c --- /dev/null +++ b/kernel/arm64/ssyrk_direct_alpha_beta_arm64_sme1.c @@ -0,0 +1,265 @@ +/* + Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + SPDX-License-Identifier: BSD-3-Clause-Clear +*/ + +#include "common.h" +#include +#include +#include +#if defined(HAVE_SME) + +#if defined(DYNAMIC_ARCH) +#define COMBINE(a,b) a ## b +#define COMBINE2(a,b) COMBINE(a,b) +#define SGEMM_PREPROCESS_BASE sgemm_direct_sme1_preprocess +#define SGEMM_PREPROCESS COMBINE2(SGEMM_PREPROCESS_BASE,TS) +#define SGEMM_DIRECT2X2_BASE sgemm_direct_alpha_beta_sme1_2VLx2VL +#define SGEMM_DIRECT2X2 COMBINE2(SGEMM_DIRECT2X2_BASE,TS) +#else +#define SGEMM_PREPROCESS sgemm_direct_sme1_preprocess +#define SGEMM_DIRECT2X2 sgemm_direct_alpha_beta_sme1_2VLx2VL +#endif + +#if defined(__ARM_FEATURE_SME) && defined(__clang__) && __clang_major__ >= 16 +#include +#endif + +/* Function prototypes */ +extern void SGEMM_PREPROCESS (uint64_t nbr, uint64_t nbc,\ + + const float * restrict a, float * a_mod) ; + +/* Function Definitions */ +static uint64_t sve_cntw() { + uint64_t cnt; + asm volatile( + "rdsvl %[res], #1\n" + "lsr %[res], %[res], #2\n" + : [res] "=r" (cnt) :: + ); + return cnt; +} + +#if defined(__ARM_FEATURE_SME) && defined(__ARM_FEATURE_LOCALLY_STREAMING) && defined(__clang__) && __clang_major__ >= 16 +// Outer product kernel. +// Computes a 2SVL x 2SVL block of C, utilizing all four FP32 tiles of ZA. +__attribute__((always_inline)) inline void +kernel_2x2(const float *A, float *B, float *C, size_t shared_dim, + size_t ldc, size_t block_rows, size_t block_cols, float alpha, + float beta, uint64_t row_idx, uint64_t col_idx) + __arm_out("za") __arm_streaming { + + const uint64_t svl = svcntw(); + size_t ldb = ldc; + // Predicate set-up + svbool_t pg = svptrue_b32(); + svbool_t pg_a_0 = svwhilelt_b32_u64(0, block_rows); + svbool_t pg_a_1 = svwhilelt_b32_u64(svl, block_rows); + + svbool_t pg_b_0 = svwhilelt_b32_u64(0, block_cols); + svbool_t pg_b_1 = svwhilelt_b32_u64(svl, block_cols); + +#define pg_c_0 pg_b_0 +#define pg_c_1 pg_b_1 + + svzero_za(); + svfloat32_t beta_vec = svdup_f32(beta); + + // Load C to ZA + for (size_t i = 0; i < MIN(svl, block_rows); i++) { + svfloat32_t row_c_0 = svld1(pg_c_0, &C[i * ldc]); + row_c_0 = svmul_x(pg, beta_vec, row_c_0); + svwrite_hor_za32_f32_m(/*tile*/0, /*slice*/i, pg_c_0, row_c_0); + + svfloat32_t row_c_1 = svld1(pg_c_1, &C[i * ldc + svl]); + row_c_1 = svmul_x(pg, beta_vec, row_c_1); + svwrite_hor_za32_f32_m(/*tile*/1, /*slice*/i, pg_c_1, row_c_1); + } + for (size_t i = svl; i < block_rows; i++) { + svfloat32_t row_c_0 = svld1(pg_c_0, &C[i * ldc]); + row_c_0 = svmul_x(pg, beta_vec, row_c_0); + svwrite_hor_za32_f32_m(/*tile*/2, /*slice*/i, pg_c_0, row_c_0); + + svfloat32_t row_c_1 = svld1(pg_c_1, &C[i * ldc + svl]); + row_c_1 = svmul_x(pg, beta_vec, row_c_1); + svwrite_hor_za32_f32_m(/*tile*/3, /*slice*/i, pg_c_1, row_c_1); + } + + svfloat32_t alpha_vec = svdup_f32(alpha); + // Iterate through shared dimension (K) + for (size_t k = 0; k < shared_dim; k++) { +#if !defined(TRANSA) + // Load column of A + svfloat32_t col_a_0 = svld1(pg_a_0, &A[k * svl]); + col_a_0 = svmul_x(pg, alpha_vec, col_a_0); + svfloat32_t col_a_1 = svld1(pg_a_1, &A[(k + shared_dim) * svl]); + col_a_1 = svmul_x(pg, alpha_vec, col_a_1); + + // Load row of A**T + svfloat32_t row_b_0 = svld1(pg_b_0, &B[k * svl]); + svfloat32_t row_b_1 = svld1(pg_b_1, &B[(k + shared_dim) * svl]); +#else + // Load column of A**T + svfloat32_t col_a_0 = svld1(pg_a_0, &A[k * ldb]); + col_a_0 = svmul_x(pg, alpha_vec, col_a_0); + + svfloat32_t col_a_1 = svld1(pg_a_1, &A[k * ldb + svl]); + col_a_1 = svmul_x(pg, alpha_vec, col_a_1); + + // Load row of A + svfloat32_t row_b_0 = svld1(pg_b_0, &B[k * ldb]); + svfloat32_t row_b_1 = svld1(pg_b_1, &B[k * ldb + svl]); +#endif + // Perform outer product + svmopa_za32_m(/*tile*/0, pg, pg, col_a_0, row_b_0); + svmopa_za32_m(/*tile*/1, pg, pg, col_a_0, row_b_1); + svmopa_za32_m(/*tile*/2, pg, pg, col_a_1, row_b_0); + svmopa_za32_m(/*tile*/3, pg, pg, col_a_1, row_b_1); + } + +#if defined(UPPER) +#define pg_c_0_full pg_c_0 +#define pg_c_1_full pg_c_1 + + bool need_update_pg_b = true; + size_t last_invalid_index = col_idx - row_idx; + // For Upper, If col_idx - row_idx >= 2*svl, we don't need to update the predicate due to all elements above the digonal + if (col_idx - row_idx >= 2*svl) { + need_update_pg_b = false; + } + // Store to C from ZA + for (size_t i = 0; i < MIN(svl, block_rows); i++, last_invalid_index++) { + if (need_update_pg_b) { + pg_c_0 = svnot_b_z(pg_c_0_full, svwhilelt_b32_u64(0, last_invalid_index)); + pg_c_1 = svnot_b_z(pg_c_1_full, svwhilelt_b32_u64(svl, last_invalid_index)); + } + + svst1_hor_za32(/*tile*/0, /*slice*/i, pg_c_0, &C[i * ldc]); + svst1_hor_za32(/*tile*/1, /*slice*/i, pg_c_1, &C[i * ldc + svl]); + } + for (size_t i = svl; i < block_rows; i++,last_invalid_index++) { + if (need_update_pg_b) { + pg_c_0 = svnot_b_z(pg_c_0_full, svwhilelt_b32_u64(0, last_invalid_index)); + pg_c_1 = svnot_b_z(pg_c_1_full, svwhilelt_b32_u64(svl, last_invalid_index)); + } + svst1_hor_za32(/*tile*/2, /*slice*/i, pg_c_0, &C[i * ldc]); + svst1_hor_za32(/*tile*/3, /*slice*/i, pg_c_1, &C[i * ldc + svl]); + } +#else + // Store to C from ZA + size_t valid_index = row_idx - col_idx + 1; + for (size_t i = 0; i < MIN(svl, block_rows); i++, valid_index++) { + pg_c_0 = svwhilelt_b32_u64(0, MIN(valid_index, block_cols)); + pg_c_1 = svwhilelt_b32_u64(svl, MIN(valid_index, block_cols)); + svst1_hor_za32(/*tile*/0, /*slice*/i, pg_c_0, &C[i * ldc]); + svst1_hor_za32(/*tile*/1, /*slice*/i, pg_c_1, &C[i * ldc + svl]); + } + for (size_t i = svl; i < block_rows; i++, valid_index++) { + pg_c_0 = svwhilelt_b32_u64(0, MIN(valid_index, block_cols)); + pg_c_1 = svwhilelt_b32_u64(svl, MIN(valid_index, block_cols)); + svst1_hor_za32(/*tile*/2, /*slice*/i, pg_c_0, &C[i * ldc]); + svst1_hor_za32(/*tile*/3, /*slice*/i, pg_c_1, &C[i * ldc + svl]); + } +#endif +} + +__arm_new("za") __arm_locally_streaming +static void ssyrk_direct_sme1_2VLx2VL(uint64_t n, uint64_t k, const float* alpha,\ + const float *ba, const float* beta, float *restrict bc) { + const uint64_t num_rows = n; + const uint64_t num_cols = n; + + const float *restrict a_ptr = ba; + const float *restrict b_ptr = ba; + float *restrict c_ptr = bc; + + const uint64_t svl = svcntw(); + const uint64_t ldc = n; + + // Block over rows of C (panels of A) + uint64_t row_idx = 0; + + // 2x2 loop + uint64_t row_batch = 2*svl; + + // Block over row dimension of C + for (; row_idx < num_rows; row_idx += row_batch) { + row_batch = MIN(row_batch, num_rows - row_idx); + uint64_t col_batch = 2*svl; +#if defined(UPPER) + // for UPLO is upper, Start from column col_idx = rows_index to ensure we only process the upper triangle (col_idx >= rows_index) + for (uint64_t col_idx = row_idx; col_idx < num_cols; col_idx += col_batch) { + col_batch = MIN(col_batch, num_cols - col_idx); +#else + // for UPLO is lower, we only process the lower triangle part (col_idx <= row_idxx) + for (uint64_t col_idx = 0; col_idx < num_cols && col_idx <= row_idx; col_idx += col_batch) { +#endif + col_batch = MIN(col_batch, num_cols - col_idx); +#if !defined(TRANSA) + kernel_2x2(&a_ptr[row_idx * k], &b_ptr[col_idx * k], + &c_ptr[row_idx * ldc + col_idx], k, + ldc, row_batch, col_batch, *alpha, *beta, row_idx, col_idx); +#else + kernel_2x2(&a_ptr[row_idx], &b_ptr[col_idx], + &c_ptr[row_idx * ldc + col_idx], k, + ldc, row_batch, col_batch, *alpha, *beta, row_idx, col_idx); +#endif + + } + } + return; +} + +#else +static void ssyrk_direct_sme1_2VLx2VL(uint64_t n, uint64_t k, const float* alpha,\ + const float *ba, const float* beta, float *restrict bc){} +#endif + +void CNAME (BLASLONG N, BLASLONG K, float alpha, float * __restrict A,\ + BLASLONG strideA, float beta, float * __restrict C, BLASLONG strideC){ +#if !defined(TRANSA) + uint64_t n_mod, vl_elms; + + vl_elms = sve_cntw(); + + n_mod = ceil((double)N/(double)vl_elms) * vl_elms; + + float *A_mod = (float *) malloc(n_mod*K*sizeof(float)); + + /* Prevent compiler optimization by reading from memory instead + * of reading directly from vector (z) registers. + * */ + asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", + "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", + "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", + "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); + + /* Pre-process the left matrix to make it suitable for + matrix sum of outer-product calculation + */ + SGEMM_PREPROCESS (N, K, A, A_mod); + asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", + "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", + "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", + "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); + ssyrk_direct_sme1_2VLx2VL(N, K, &alpha, A_mod, &beta, C); + free(A_mod); +#else + ssyrk_direct_sme1_2VLx2VL(N, K, &alpha, A, &beta, C); +#endif + +} + +#else + +void CNAME (BLASLONG N, BLASLONG K, float alpha, float * __restrict A,\ + BLASLONG strideA, float beta, float * __restrict C, BLASLONG strideC){ +fprintf(stderr,"empty ssyrk_direct kernel should never be called\n"); +} + +#endif diff --git a/kernel/arm64/strmm_direct_arm64_sme1.c b/kernel/arm64/strmm_direct_arm64_sme1.c new file mode 100644 index 0000000000..f6a8505965 --- /dev/null +++ b/kernel/arm64/strmm_direct_arm64_sme1.c @@ -0,0 +1,260 @@ +/* + Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + SPDX-License-Identifier: BSD-3-Clause-Clear +*/ + +#include "common.h" +#include +#include +#include +//#include "sme_abi.h" +#if defined(HAVE_SME) + +#if defined(__ARM_FEATURE_SME) && defined(__clang__) && __clang_major__ >= 16 +#include +#endif + +/* Function Definitions */ +static uint64_t sve_cntw() { + uint64_t cnt; + asm volatile( + "rdsvl %[res], #1\n" + "lsr %[res], %[res], #2\n" + : [res] "=r" (cnt) :: + ); + return cnt; +} + +#if defined(__ARM_FEATURE_SME) && defined(__ARM_FEATURE_LOCALLY_STREAMING) && defined(__clang__) && __clang_major__ >= 16 + +// Transpose 1SVL x N panel of A +__attribute__((always_inline)) +inline static void transpose_panel_lower(const float *restrict a, float *restrict b, + uint64_t rows, uint64_t cols, + uint64_t a_step, uint64_t rows_index) +__arm_out("za") __arm_streaming { + // for Lower Trangular Matrix + uint64_t svl = svcntw(); + uint64_t col_batch = svl; + + svzero_za(); + uint64_t last_rows_index = rows_index + rows - 1; + for (uint64_t k = 0; k < cols; k += col_batch) { + if (last_rows_index < k) { + // Early exit: if all rows are above the diagonal, no valid elements remain + break; + } + // Load to horizontal slices + for (uint64_t row = 0; row < rows; row++) { + svbool_t pg_row = svwhilelt_b32_u64(k, MIN(rows_index + row + 1, cols)); + svld1_hor_za32(0, row, pg_row, &a[row * a_step + k]); + } + + // Save from vertical slices + col_batch = MIN(col_batch, cols - k); + for (uint64_t col = 0; col < col_batch; col++) { + svst1_ver_za32(0, col, svptrue_b32(), &b[(col + k) * svl]); + } + } +} + +__attribute__((always_inline)) +inline static void transpose_panel_upper(const float *restrict a, float *restrict b, + uint64_t rows, uint64_t cols, + uint64_t a_step, uint64_t rows_index) +__arm_out("za") __arm_streaming { + // for Upper Trangular Matrix + uint64_t svl = svcntw(); + uint64_t col_batch = svl; + + svzero_za(); + // Start from column k = rows_index to ensure we only process the upper triangle (k >= rows_index) + for (uint64_t k = rows_index; k < cols; k += col_batch) { + // Load to horizontal slices + for (uint64_t row = 0; row < rows; row++) { + svbool_t pg_row = svwhilelt_b32_u64(k, cols); + svld1_hor_za32(0, row, pg_row, &a[row * a_step + k]); + } + + // Save from vertical slices + col_batch = MIN(col_batch, cols - k); + for (uint64_t col = 0, real_col = k; col < col_batch; col++, real_col++) { + // Only the upper triangular part of the matrix is stored. + svbool_t pg_col = svwhilelt_b32_u64(rows_index, real_col + 1); + svst1_ver_za32(0, col, pg_col, &b[(col + k) * svl]); + } + } +} + +__arm_new("za") __arm_locally_streaming +static void strmm_direct_sme1_preprocess(uint64_t nbr, uint64_t nbc, + const float *restrict a, float *restrict a_mod) { + const uint64_t num_rows = nbr; + uint64_t row_batch = svcntsw(); + for (uint64_t row_idx = 0; row_idx < num_rows; row_idx += row_batch) { + // Transpose 1SVL x N panel of A + row_batch = MIN(row_batch, num_rows - row_idx); +#if !defined(UPPER) + transpose_panel_lower(&a[row_idx * nbc], &a_mod[row_idx * nbc], row_batch, nbc, nbc, row_idx); +#else + transpose_panel_upper(&a[row_idx * nbc], &a_mod[row_idx * nbc], row_batch, nbc, nbc, row_idx); +#endif + } +} + +// Outer product kernel. +// Computes a 2SVL x 2SVL block of C, utilizing all four FP32 tiles of ZA. +__attribute__((always_inline)) inline void +kernel_2x2(const float *A, const float *B, float *C, size_t shared_dim, + size_t ldc, size_t block_rows, size_t block_cols, float alpha, uint64_t row_idx) + __arm_out("za") __arm_streaming { + const uint64_t svl = svcntw(); + size_t ldb = ldc; + // Predicate set-up + svbool_t pg = svptrue_b32(); + svbool_t pg_a_0 = svwhilelt_b32_u64(0, block_rows); + svbool_t pg_a_1 = svwhilelt_b32_u64(svl, block_rows); + +#if (!defined(TRANSA) && !defined(UPPER)) || (defined(TRANSA) && defined(UPPER)) +#define pg_a_0_full pg_a_0 +#define pg_a_1_full pg_a_1 +#endif + svbool_t pg_b_0 = svwhilelt_b32_u64(0, block_cols); + svbool_t pg_b_1 = svwhilelt_b32_u64(svl, block_cols); + +#define pg_c_0 pg_b_0 +#define pg_c_1 pg_b_1 + + svzero_za(); + svfloat32_t alpha_vec = svdup_f32(alpha); + // Iterate through shared dimension (K) +#if (!defined(TRANSA) && defined(UPPER)) || (defined(TRANSA) && !defined(UPPER)) + for (size_t k = row_idx, valid_index = 1; k < shared_dim; k++,valid_index++) { + pg_a_0 = svwhilelt_b32_u64(0, MIN(valid_index, block_rows)); + pg_a_1 = svwhilelt_b32_u64(svl, MIN(valid_index, block_rows)); +#else + for (size_t k = 0; k < MIN(row_idx + block_rows, shared_dim); k++) { + // If k exceeds row_idx, mask out rows before (k - row_idx) + // This ensures only valid rows are included for lower triangular logic. + if (k > row_idx) { + pg_a_0 = svnot_b_z(pg_a_0_full, svwhilelt_b32_u64(0, k - row_idx)); + pg_a_1 = svnot_b_z(pg_a_1_full, svwhilelt_b32_u64(svl, k - row_idx)); + } +#endif + +#if !defined(TRANSA) + // Load column of A + svfloat32_t col_a_0 = svld1(pg_a_0, &A[k * svl]); + svfloat32_t col_a_1 = svld1(pg_a_1, &A[(k + shared_dim) * svl]); +#else + svfloat32_t col_a_0 = svld1(pg_a_0, &A[k * shared_dim]); + svfloat32_t col_a_1 = svld1(pg_a_1, &A[k * shared_dim + svl]); +#endif + col_a_0 = svmul_x(pg_a_0, alpha_vec, col_a_0); + col_a_1 = svmul_x(pg_a_1, alpha_vec, col_a_1); + // Load row of B + svfloat32_t row_b_0 = svld1(pg_b_0, &B[k * ldb]); + svfloat32_t row_b_1 = svld1(pg_b_1, &B[k * ldb + svl]); + // Perform outer product + svmopa_za32_m(/*tile*/0, pg_a_0, pg, col_a_0, row_b_0); + svmopa_za32_m(/*tile*/1, pg_a_0, pg, col_a_0, row_b_1); + svmopa_za32_m(/*tile*/2, pg_a_1, pg, col_a_1, row_b_0); + svmopa_za32_m(/*tile*/3, pg_a_1, pg, col_a_1, row_b_1); + } + + // Store to C from ZA + for (size_t i = 0; i < MIN(svl, block_rows); i++) { + svst1_hor_za32(/*tile*/0, /*slice*/i, pg_c_0, &C[i * ldc]); + svst1_hor_za32(/*tile*/1, /*slice*/i, pg_c_1, &C[i * ldc + svl]); + } + for (size_t i = svl; i < block_rows; i++) { + svst1_hor_za32(/*tile*/2, /*slice*/i, pg_c_0, &C[i * ldc]); + svst1_hor_za32(/*tile*/3, /*slice*/i, pg_c_1, &C[i * ldc + svl]); + } + +} + +__arm_new("za") __arm_locally_streaming +static inline void strmm_direct_alpha_sme1_2VLx2VL(uint64_t m, uint64_t k, uint64_t n, const float* alpha,\ + const float *ba, float *restrict bb) { + const uint64_t num_rows = m; + const uint64_t num_cols = n; + + const float *restrict a_ptr = ba; + const float *restrict b_ptr = bb; + float *restrict c_ptr = bb; + + const uint64_t svl = svcntw(); + const uint64_t svl_x2 = 2*svl; + const uint64_t ldc = n; + + + uint64_t row_idx = 0; +#if (!defined(TRANSA) && defined(UPPER)) || (defined(TRANSA) && !defined(UPPER)) + // 2x2 loop + uint64_t row_batch = svl_x2; + // Block over rows of C (panels of A) + for (; row_idx < num_rows; row_idx += row_batch) { + row_batch = MIN(row_batch, num_rows - row_idx); +#else + // Calculate the remainder of num_rows divided by 2VL to determine tail tile size + uint64_t row_batch = num_rows % svl_x2; + // If there's no remainder, use full tile size (2VL) for initial batch + if (row_batch == 0) row_batch = svl_x2; + // Loop from bottom to top, processing rows in batches + for (uint64_t index = num_rows; index > 0; index -= row_batch, row_batch = svl_x2) { + // Compute the starting row index for the current batch + row_idx = index - row_batch; +#endif + uint64_t col_idx = 0; + uint64_t col_batch = svl_x2; + // Block over column dimension of C + for (; col_idx < num_cols; col_idx += col_batch) { + col_batch = MIN(col_batch, num_cols - col_idx); +#if !defined(TRANSA) + kernel_2x2(&a_ptr[row_idx * k], &b_ptr[col_idx], + &c_ptr[row_idx * ldc + col_idx], k, + ldc, row_batch, col_batch, *alpha, row_idx); +#else + kernel_2x2(&a_ptr[row_idx], &b_ptr[col_idx], + &c_ptr[row_idx * ldc + col_idx], k, + ldc, row_batch, col_batch, *alpha, row_idx); +#endif + } + } + + return; +} + +#else +static void strmm_direct_sme1_preprocess(uint64_t nbr, uint64_t nbc, + const float *restrict a, float *restrict a_mod) {} +static void strmm_direct_alpha_sme1_2VLx2VL(uint64_t m, uint64_t k, uint64_t n, const float* alpha,\ + const float *ba, float *restrict bb){} +#endif + +void CNAME (BLASLONG M, BLASLONG N, float alpha, float * __restrict A,\ + BLASLONG strideA, float * __restrict B, BLASLONG strideB){ +#if !defined(TRANSA) + uint64_t m_mod, vl_elms; + + vl_elms = sve_cntw(); + + m_mod = ceil((double)M/(double)vl_elms) * vl_elms; + + float *A_mod = (float *) malloc(m_mod*M*sizeof(float)); + strmm_direct_sme1_preprocess(M, M, A, A_mod); + /* Calculate B = alpha*A*B*/ + strmm_direct_alpha_sme1_2VLx2VL(M, M, N, &alpha, A_mod, B); + free(A_mod); +#else + strmm_direct_alpha_sme1_2VLx2VL(M, M, N, &alpha, A, B); +#endif +} + +#else +void CNAME (BLASLONG M, BLASLONG N, float alpha, float * __restrict A,\ + BLASLONG strideA, float * __restrict B, BLASLONG strideB){ + } + +#endif diff --git a/kernel/arm64/zdot_thunderx2t99.c b/kernel/arm64/zdot_thunderx2t99.c index d48392412b..fa2eb7519f 100644 --- a/kernel/arm64/zdot_thunderx2t99.c +++ b/kernel/arm64/zdot_thunderx2t99.c @@ -32,7 +32,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #include "common.h" - +#ifdef _MSC_VER +#include +#endif #include #define N "x0" /* vector length */ @@ -197,14 +199,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(SMP) extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, - void *c, BLASLONG ldc, int (*function)(), int nthreads); + void *c, BLASLONG ldc, int (*function)(void), int nthreads); #endif static void zdot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, OPENBLAS_COMPLEX_FLOAT *result) { - FLOAT dotr = 0.0, doti = 0.0; + FLOAT dotr = 0.0, doti = 0.0; + +#ifdef _MSC_VER + CREAL(*result) = 0.0; + CIMAG(*result) = 0.0; +#else OPENBLAS_COMPLEX_FLOAT cf = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0, 0.0); - *result = cf; + *result = cf; +#endif if ( n < 0 ) return; @@ -235,8 +243,9 @@ static void zdot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLON " asr "J", "N", #"N_DIV_SHIFT" \n" " cmp "J", xzr \n" " beq 3f //dot_kernel_F1 \n" - +#ifndef _MSC_VER " .align 5 \n" +#endif "2: //dot_kernel_F: \n" " "KERNEL_F" \n" " subs "J", "J", #1 \n" @@ -297,10 +306,14 @@ static void zdot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLON "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" ); - +#ifdef _MSC_VER + CREAL(*result) = dotr; + CIMAG(*result) = doti; +#else cf=OPENBLAS_MAKE_COMPLEX_FLOAT(dotr, doti); *result = cf; - return; +#endif + return; } #if defined(SMP) @@ -320,7 +333,13 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA int nthreads; FLOAT dummy_alpha; #endif +#ifdef _MSC_VER + OPENBLAS_COMPLEX_FLOAT zdot; + CREAL(zdot) = 0.0; + CIMAG(zdot) = 0.0; +#else OPENBLAS_COMPLEX_FLOAT zdot = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); +#endif #if defined(SMP) if (inc_x == 0 || inc_y == 0 || n <= 10000) @@ -347,7 +366,11 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA ptr = (OPENBLAS_COMPLEX_FLOAT *)result; for (i = 0; i < nthreads; i++) { +#ifdef _MSC_VER + CREAL(zdot)+= CREAL(*ptr);CIMAG(zdot)+=CIMAG(*ptr); +#else zdot = OPENBLAS_MAKE_COMPLEX_FLOAT (CREAL(zdot) + CREAL(*ptr), CIMAG(zdot) + CIMAG(*ptr)); +#endif ptr = (void *)(((char *)ptr) + sizeof(double) * 2); } } diff --git a/kernel/generic/conversion_macros.h b/kernel/generic/conversion_macros.h new file mode 100644 index 0000000000..69f8520128 --- /dev/null +++ b/kernel/generic/conversion_macros.h @@ -0,0 +1,82 @@ +/*************************************************************************** + * Copyright (c) 2025, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#if defined(BFLOAT16) && defined(BFLOAT16CONVERSION) + +static float +bfloat16tof32 (bfloat16 value) +{ + blasint one = 1; + float result; + sbf16tos_(&one, &value, &one, &result, &one); + return result; +} + +#ifdef BGEMM +static bfloat16 f32tobfloat16(float value) { + blasint one = 1; + bfloat16 result; + sbstobf16_(&one, &value, &one, &result, &one); + return result; +} +#endif + +#ifdef BGEMM +#define ALPHA bfloat16tof32(alpha) +#define BETA bfloat16tof32(beta) +#define TO_F32(x) (bfloat16tof32(x)) +#define TO_OUTPUT(x) (f32tobfloat16(x)) +#else +#define ALPHA alpha +#define BETA beta +#define TO_F32(x) (bfloat16tof32(x)) +#define TO_OUTPUT(x) x +#endif + +#elif defined(HFLOAT16) + +#ifdef HGEMM +#define ALPHA (float)(alpha) +#define BETA (float)(beta) +#define TO_F32(x) ((float)(x)) +#define TO_OUTPUT(x) ((_Float16)(x)) +#else +#define ALPHA alpha +#define BETA beta +#define TO_F32(x) ((float)(x)) +#define TO_OUTPUT(x) x +#endif + +#else + +#define ALPHA alpha +#define BETA beta +#define TO_F32(x) x +#define TO_OUTPUT(x) x + +#endif diff --git a/kernel/generic/gemm_beta.c b/kernel/generic/gemm_beta.c index ccb772cc7d..74e9bf9a97 100644 --- a/kernel/generic/gemm_beta.c +++ b/kernel/generic/gemm_beta.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2025 The OpenBLAS Project. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -38,11 +39,41 @@ #include "common.h" +#if defined(BFLOAT16) && defined(BGEMM) && defined(BFLOAT16CONVERSION) +static float +bfloat16tof32 (bfloat16 f16) +{ + float result = 0; + unsigned short* q = (unsigned short*)(&result); +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + q[0] = f16; +#else + q[1] = f16; +#endif + return result; +} +static bfloat16 +f32tobfloat16(float f32) +{ + unsigned short* q = (unsigned short*)(&f32); +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + return q[0]; +#else + return q[1]; +#endif +} + +#define BF16TOF32(x) (bfloat16tof32(x)) +#define F32TOBF16(x) (f32tobfloat16(x)) +#else +#define BF16TOF32(x) x +#define F32TOBF16(x) x +#endif + int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5, FLOAT *c, BLASLONG ldc){ - BLASLONG i, j; BLASLONG chunk, remain; FLOAT *c_offset1, *c_offset; @@ -54,18 +85,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, c_offset1 = c_offset; c_offset += ldc; for(i=chunk; i>0; i--){ - *(c_offset1 + 0) = ZERO; - *(c_offset1 + 1) = ZERO; - *(c_offset1 + 2) = ZERO; - *(c_offset1 + 3) = ZERO; - *(c_offset1 + 4) = ZERO; - *(c_offset1 + 5) = ZERO; - *(c_offset1 + 6) = ZERO; - *(c_offset1 + 7) = ZERO; + *(c_offset1 + 0) = F32TOBF16(ZERO); + *(c_offset1 + 1) = F32TOBF16(ZERO); + *(c_offset1 + 2) = F32TOBF16(ZERO); + *(c_offset1 + 3) = F32TOBF16(ZERO); + *(c_offset1 + 4) = F32TOBF16(ZERO); + *(c_offset1 + 5) = F32TOBF16(ZERO); + *(c_offset1 + 6) = F32TOBF16(ZERO); + *(c_offset1 + 7) = F32TOBF16(ZERO); c_offset1 += 8; } for(i=remain; i>0; i--){ - *c_offset1 = ZERO; + *c_offset1 = F32TOBF16(ZERO); c_offset1 ++; } } @@ -74,18 +105,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, c_offset1 = c_offset; c_offset += ldc; for(i=chunk; i>0; i--){ - *(c_offset1 + 0) *= beta; - *(c_offset1 + 1) *= beta; - *(c_offset1 + 2) *= beta; - *(c_offset1 + 3) *= beta; - *(c_offset1 + 4) *= beta; - *(c_offset1 + 5) *= beta; - *(c_offset1 + 6) *= beta; - *(c_offset1 + 7) *= beta; + *(c_offset1 + 0) = F32TOBF16(BF16TOF32(beta) * BF16TOF32(c_offset1[0])); + *(c_offset1 + 1) = F32TOBF16(BF16TOF32(beta) * BF16TOF32(c_offset1[1])); + *(c_offset1 + 2) = F32TOBF16(BF16TOF32(beta) * BF16TOF32(c_offset1[2])); + *(c_offset1 + 3) = F32TOBF16(BF16TOF32(beta) * BF16TOF32(c_offset1[3])); + *(c_offset1 + 4) = F32TOBF16(BF16TOF32(beta) * BF16TOF32(c_offset1[4])); + *(c_offset1 + 5) = F32TOBF16(BF16TOF32(beta) * BF16TOF32(c_offset1[5])); + *(c_offset1 + 6) = F32TOBF16(BF16TOF32(beta) * BF16TOF32(c_offset1[6])); + *(c_offset1 + 7) = F32TOBF16(BF16TOF32(beta) * BF16TOF32(c_offset1[7])); c_offset1 += 8; } for(i=remain; i>0; i--){ - *c_offset1 *= beta; + *c_offset1 = F32TOBF16(BF16TOF32(beta) * BF16TOF32(c_offset1[0])); c_offset1 ++; } } diff --git a/kernel/generic/gemmkernel_2x2.c b/kernel/generic/gemmkernel_2x2.c index bf1c3ae381..07da2cbc87 100644 --- a/kernel/generic/gemmkernel_2x2.c +++ b/kernel/generic/gemmkernel_2x2.c @@ -1,21 +1,35 @@ +/*************************************************************************** + * Copyright (c) 2025, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + #include "common.h" -#if defined(BFLOAT16) && defined(BFLOAT16CONVERSION) -static float -bfloat16tof32 (bfloat16 f16) -{ - float result = 0; - unsigned short* q = (unsigned short*)(&result); -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - q[0] = f16; -#else - q[1] = f16; -#endif - return result; -} -#define BF16TOF32(x) (bfloat16tof32(x)) -#else -#define BF16TOF32(x) x -#endif + +#include "conversion_macros.h" + int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,IFLOAT* ba,IFLOAT* bb,FLOAT* C,BLASLONG ldc #ifdef TRMMKERNEL ,BLASLONG offset @@ -25,7 +39,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,IFLOAT* ba,IFLOAT* bb, BLASLONG i,j,k; FLOAT *C0,*C1; IFLOAT *ptrba,*ptrbb; +#ifdef BGEMM + float res0,res1,res2,res3; +#else FLOAT res0,res1,res2,res3; +#endif IFLOAT load0,load1,load2,load3,load4,load5,load6,load7; for (j=0; j typedef __vector unsigned short vec_bf16; #else diff --git a/kernel/power/sasum_microk_power10.c b/kernel/power/sasum_microk_power10.c index ea12a4264b..eba2c13ab5 100644 --- a/kernel/power/sasum_microk_power10.c +++ b/kernel/power/sasum_microk_power10.c @@ -141,7 +141,7 @@ static float sasum_kernel_32 (long n, float *x) "=wa" (t2), // 5 "=wa" (t3) // 6 : - "m" (*x) + "m" (*(const float (*)[n]) x) : "cr0", "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", diff --git a/kernel/power/sasum_microk_power8.c b/kernel/power/sasum_microk_power8.c index aa465c38e3..dad9492a91 100644 --- a/kernel/power/sasum_microk_power8.c +++ b/kernel/power/sasum_microk_power8.c @@ -159,7 +159,7 @@ static float sasum_kernel_32 (long n, float *x) "=wa" (t2), // 5 "=wa" (t3) // 6 : - "m" (*x), + "m" (*(const float (*)[n]) x), "b" (16), // 8 "b" (32), // 9 "b" (48), // 10 diff --git a/kernel/power/saxpy_microk_power10.c b/kernel/power/saxpy_microk_power10.c index cf5f459593..67787f82c4 100644 --- a/kernel/power/saxpy_microk_power10.c +++ b/kernel/power/saxpy_microk_power10.c @@ -238,13 +238,13 @@ static void saxpy_kernel_64(long n, float *x, float *y, float alpha) "#n=%1 x=%5=%2 y=%0=%3 t0=%x4\n" : - "+m" (*y), + "+m" (*(float (*)[n]) y), "+r" (n), // 1 "+b" (x), // 2 "+b" (y) // 3 : "wa" (t0), // 4 - "m" (*x) + "m" (*(const float (*)[n]) x) : "cr0", "vs32","vs33","vs34","vs35","vs36","vs37", "vs38", "vs39", diff --git a/kernel/power/scopy_microk_power8.c b/kernel/power/scopy_microk_power8.c index da39789b1a..64b4827363 100644 --- a/kernel/power/scopy_microk_power8.c +++ b/kernel/power/scopy_microk_power8.c @@ -92,12 +92,12 @@ static void scopy_kernel_32 (long n, float *x, float *y) "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : - "=m" (*y), + "=m" (*(float (*)[n]) y), "+r" (n), // 1 "+b" (x), // 2 "+b" (y) // 3 : - "m" (*x), + "m" (*(const float (*)[n]) x), "b" (16), // 5 "b" (32), // 6 "b" (48), // 7 diff --git a/kernel/power/sdot_microk_power10.c b/kernel/power/sdot_microk_power10.c index 2f028c5a0d..7c323c723c 100644 --- a/kernel/power/sdot_microk_power10.c +++ b/kernel/power/sdot_microk_power10.c @@ -122,8 +122,8 @@ static float sdot_kernel_16 (long n, float *x, float *y) "+b" (x), // 2 "+b" (y) // 3 : - "m" (*x), - "m" (*y) + "m" (*(const float (*)[n]) x), + "m" (*(const float (*)[n]) y) : "cr0", "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", diff --git a/kernel/power/sdot_microk_power8.c b/kernel/power/sdot_microk_power8.c index a8db6a8d65..be05cec8b3 100644 --- a/kernel/power/sdot_microk_power8.c +++ b/kernel/power/sdot_microk_power8.c @@ -155,8 +155,8 @@ static float sdot_kernel_16 (long n, float *x, float *y) "=wa" (t2), // 6 "=wa" (t3) // 7 : - "m" (*x), - "m" (*y), + "m" (*(const float (*)[n]) x), + "m" (*(const float (*)[n]) y), "b" (16), // 10 "b" (32), // 11 "b" (48), // 12 diff --git a/kernel/power/sgemm_kernel_power10.c b/kernel/power/sgemm_kernel_power10.c index 80f495f708..1d86b57fcc 100644 --- a/kernel/power/sgemm_kernel_power10.c +++ b/kernel/power/sgemm_kernel_power10.c @@ -245,118 +245,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO += 16; BO += 8; temp--; - BLASLONG K = temp / 64; + BLASLONG K = temp / 16; for (l = 0; l < K; l++) - { - vec_t *rowA = (vec_t *) & AO[0]; - vec_t *rowB = (vec_t *) & BO[0]; - KERNEL (0, 0); - KERNEL (2, 4); - KERNEL (4, 8); - KERNEL (6, 12); - KERNEL (8, 16); - KERNEL (10, 20); - KERNEL (12, 24); - KERNEL (14, 28); - KERNEL (16, 32); - KERNEL (18, 36); - KERNEL (20, 40); - KERNEL (22, 44); - KERNEL (24, 48); - KERNEL (26, 52); - KERNEL (28, 56); - KERNEL (30, 60); - KERNEL (32, 64); - KERNEL (34, 68); - KERNEL (36, 72); - KERNEL (38, 76); - KERNEL (40, 80); - KERNEL (42, 84); - KERNEL (44, 88); - KERNEL (46, 92); - KERNEL (48, 96); - KERNEL (50, 100); - KERNEL (52, 104); - KERNEL (54, 108); - KERNEL (56, 112); - KERNEL (58, 116); - KERNEL (60, 120); - KERNEL (62, 124); - KERNEL (64, 128); - KERNEL (66, 132); - KERNEL (68, 136); - KERNEL (70, 140); - KERNEL (72, 144); - KERNEL (74, 148); - KERNEL (76, 152); - KERNEL (78, 156); - KERNEL (80, 160); - KERNEL (82, 164); - KERNEL (84, 168); - KERNEL (86, 172); - KERNEL (88, 176); - KERNEL (90, 180); - KERNEL (92, 184); - KERNEL (94, 188); - KERNEL (96, 192); - KERNEL (98, 196); - KERNEL (100, 200); - KERNEL (102, 204); - KERNEL (104, 208); - KERNEL (106, 212); - KERNEL (108, 216); - KERNEL (110, 220); - KERNEL (112, 224); - KERNEL (114, 228); - KERNEL (116, 232); - KERNEL (118, 236); - KERNEL (120, 240); - KERNEL (122, 244); - KERNEL (124, 248); - KERNEL (126, 252); - AO += 1024; - BO += 512; - } - if ((temp & 63) >> 5) - { - vec_t *rowA = (vec_t *) & AO[0]; - vec_t *rowB = (vec_t *) & BO[0]; - KERNEL (0, 0); - KERNEL (2, 4); - KERNEL (4, 8); - KERNEL (6, 12); - KERNEL (8, 16); - KERNEL (10, 20); - KERNEL (12, 24); - KERNEL (14, 28); - KERNEL (16, 32); - KERNEL (18, 36); - KERNEL (20, 40); - KERNEL (22, 44); - KERNEL (24, 48); - KERNEL (26, 52); - KERNEL (28, 56); - KERNEL (30, 60); - KERNEL (32, 64); - KERNEL (34, 68); - KERNEL (36, 72); - KERNEL (38, 76); - KERNEL (40, 80); - KERNEL (42, 84); - KERNEL (44, 88); - KERNEL (46, 92); - KERNEL (48, 96); - KERNEL (50, 100); - KERNEL (52, 104); - KERNEL (54, 108); - KERNEL (56, 112); - KERNEL (58, 116); - KERNEL (60, 120); - KERNEL (62, 124); - AO += 512; - BO += 256; - } - if ((temp & 31) >> 4) { vec_t *rowA = (vec_t *) & AO[0]; vec_t *rowB = (vec_t *) & BO[0]; diff --git a/kernel/power/srot_microk_power10.c b/kernel/power/srot_microk_power10.c index c54c307424..e64ea3b36c 100644 --- a/kernel/power/srot_microk_power10.c +++ b/kernel/power/srot_microk_power10.c @@ -133,8 +133,8 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) "#n=%2 x=%0=%3 y=%1=%4 c=%5 s=%6\n" : - "+m" (*x), - "+m" (*y), + "+m" (*(float (*)[n]) x), + "+m" (*(float (*)[n]) y), "+r" (n), // 2 "+b" (x), // 3 "+b" (y) // 4 diff --git a/kernel/power/srot_microk_power8.c b/kernel/power/srot_microk_power8.c index 329a8cd069..058f4045d5 100644 --- a/kernel/power/srot_microk_power8.c +++ b/kernel/power/srot_microk_power8.c @@ -188,8 +188,8 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) "#n=%2 x=%0=%3 y=%1=%4 c=%13 s=%14 o16=%15 o32=%16 o48=%17\n" "#t0=%x5 t1=%x6 t2=%x7 t3=%x8 t4=%x9 t5=%x10 t6=%x11 t7=%x12" : - "+m" (*x), - "+m" (*y), + "+m" (*(float (*)[n]) x), + "+m" (*(float (*)[n]) y), "+r" (n), // 2 "+b" (x), // 3 "+b" (y), // 4 diff --git a/kernel/power/sscal_microk_power10.c b/kernel/power/sscal_microk_power10.c index 76703325c1..a52707a246 100644 --- a/kernel/power/sscal_microk_power10.c +++ b/kernel/power/sscal_microk_power10.c @@ -119,7 +119,7 @@ static void sscal_kernel_16 (long n, float *x, float alpha) "#n=%1 alpha=%3 x=%0=%2" : - "+m" (*x), + "+m" (*(float (*)[n]) x), "+r" (n), // 1 "+b" (x) // 2 : @@ -159,7 +159,7 @@ static void sscal_kernel_16_zero (long n, float *x) "#n=%1 x=%0=%2 " : - "=m" (*x), + "=m" (*(float (*)[n]) x), "+r" (n), // 1 "+b" (x) // 2 : diff --git a/kernel/power/sscal_microk_power8.c b/kernel/power/sscal_microk_power8.c index 88fba3166a..dbbfcb76ef 100644 --- a/kernel/power/sscal_microk_power8.c +++ b/kernel/power/sscal_microk_power8.c @@ -119,7 +119,7 @@ static void sscal_kernel_16 (long n, float *x, float alpha) "#n=%1 alpha=%3 x=%0=%2 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10" : - "+m" (*x), + "+m" (*(float (*)[n]) x), "+r" (n), // 1 "+b" (x), // 2 "+f" (alpha) // 3 @@ -166,7 +166,7 @@ static void sscal_kernel_16_zero (long n, float *x) "#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10" : - "=m" (*x), + "=m" (*(float (*)[n]) x), "+r" (n), // 1 "+b" (x), // 2 "=wa" (t0) // 3 diff --git a/kernel/power/sswap_microk_power8.c b/kernel/power/sswap_microk_power8.c index a407018a8b..5b34bf367e 100644 --- a/kernel/power/sswap_microk_power8.c +++ b/kernel/power/sswap_microk_power8.c @@ -87,8 +87,8 @@ static void sswap_kernel_32 (long n, float *x, float *y) "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : - "+m" (*x), - "+m" (*y), + "+m" (*(float (*)[n]) x), + "+m" (*(float (*)[n]) y), "+r" (n), // 2 "+b" (x), // 3 "+b" (y) // 4 diff --git a/kernel/power/swap_microk_power10.c b/kernel/power/swap_microk_power10.c index 118adee5fd..29df220912 100644 --- a/kernel/power/swap_microk_power10.c +++ b/kernel/power/swap_microk_power10.c @@ -143,8 +143,8 @@ static void sswap_kernel_32 (long n, float *x, float *y) "#n=%2 x=%0=%3 y=%1=%4" : - "+m" (*x), - "+m" (*y), + "+m" (*(FLOAT (*)[n]) x), + "+m" (*(FLOAT (*)[n]) y), "+r" (n), // 2 "+b" (x), // 3 "+b" (y) // 4 diff --git a/kernel/power/zasum_microk_power8.c b/kernel/power/zasum_microk_power8.c index 3f0af42321..8ec7c307fe 100644 --- a/kernel/power/zasum_microk_power8.c +++ b/kernel/power/zasum_microk_power8.c @@ -154,7 +154,7 @@ static double zasum_kernel_8 (long n, double *x) "=wa" (t2), // 5 "=wa" (t3) // 6 : - "m" (*x), + "m" (*(const double (*)[n * 2]) x), "b" (16), // 8 "b" (32), // 9 "b" (48), // 10 diff --git a/kernel/power/zaxpy_microk_power10.c b/kernel/power/zaxpy_microk_power10.c index 366c7ed62a..44a322e439 100644 --- a/kernel/power/zaxpy_microk_power10.c +++ b/kernel/power/zaxpy_microk_power10.c @@ -210,7 +210,7 @@ static void zaxpy_kernel_4 (long n, double *x, double *y, "#n=%1 x=%13=%2 y=%0=%3 alpha=(%15,%16) mvecp=%14=%17 ytmp=%12\n" "#t0=%x4 t1=%x5 t2=%x6 t3=%x7 t4=%x8 t5=%x9 t6=%x10 t7=%x11" : - "+m" (*y), + "+m" (*(double (*)[n * 2]) y), "+r" (n), // 1 "+b" (x), // 2 "+b" (y), // 3 @@ -224,8 +224,9 @@ static void zaxpy_kernel_4 (long n, double *x, double *y, "=wa" (t7), // 11 "=b" (ytmp) // 12 : - "m" (*x), - "m" (*mvecp), + "m" (*(const double (*)[n * 2]) x), + "m" (*(const double (*)[2]) mvecp), + "d" (alpha_r), // 15 "d" (alpha_i), // 16 "12" (mvecp) // 17 diff --git a/kernel/power/zaxpy_microk_power8.c b/kernel/power/zaxpy_microk_power8.c index 959050e5f1..e24b0c3274 100644 --- a/kernel/power/zaxpy_microk_power8.c +++ b/kernel/power/zaxpy_microk_power8.c @@ -224,7 +224,7 @@ static void zaxpy_kernel_4 (long n, double *x, double *y, "#n=%1 x=%17=%2 y=%0=%3 alpha=(%19,%20) mvecp=%18=%16 o16=%22 o32=%23 o48=%24 ytmp=%16\n" "#t0=%x4 t1=%x5 t2=%x6 t3=%x7 t4=%x8 t5=%x9 t6=%x10 t7=%x11 t8=%x12 t9=%x13 t10=%x14 t11=%x15" : - "+m" (*y), + "+m" (*(double (*)[n * 2]) y), "+r" (n), // 1 "+b" (x), // 2 "+b" (y), // 3 @@ -242,8 +242,8 @@ static void zaxpy_kernel_4 (long n, double *x, double *y, "=wa" (t11), // 15 "=b" (ytmp) // 16 : - "m" (*x), - "m" (*mvecp), + "m" (*(const double (*)[n * 2]) x), + "m" (*(const double (*)[2]) mvecp), "d" (alpha_r), // 19 "d" (alpha_i), // 20 "16" (mvecp), // 21 diff --git a/kernel/power/zcopy_microk_power8.c b/kernel/power/zcopy_microk_power8.c index e295470477..8ae3f8bb7d 100644 --- a/kernel/power/zcopy_microk_power8.c +++ b/kernel/power/zcopy_microk_power8.c @@ -134,12 +134,12 @@ static void zcopy_kernel_16 (long n, FLOAT *x, FLOAT *y) "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : - "=m" (*y), + "=m" (*(FLOAT (*)[n * 2]) y), "+r" (n), // 1 "+b" (x), // 2 "+b" (y) // 3 : - "m" (*x), + "m" (*(const FLOAT (*)[n * 2]) x), "b" (16), // 5 "b" (32), // 6 "b" (48), // 7 diff --git a/kernel/power/zdot_microk_power8.c b/kernel/power/zdot_microk_power8.c index dcde824330..584d9d4489 100644 --- a/kernel/power/zdot_microk_power8.c +++ b/kernel/power/zdot_microk_power8.c @@ -186,13 +186,13 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot) "#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6 o16=%7 o32=%8 o48=%9" : - "=m" (*dot), + "=m" (*(double (*)[4]) dot), "+r" (n), // 1 "+b" (x), // 2 "+b" (y) // 3 : - "m" (*x), - "m" (*y), + "m" (*(double (*)[n * 2]) x), + "m" (*(double (*)[n * 2]) y), "b" (dot), // 6 "b" (16), // 7 "b" (32), // 8 diff --git a/kernel/power/zscal_microk_power10.c b/kernel/power/zscal_microk_power10.c index af99b86484..2f350cefef 100644 --- a/kernel/power/zscal_microk_power10.c +++ b/kernel/power/zscal_microk_power10.c @@ -201,7 +201,7 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) #endif "#n=%1 x=%0=%2 alpha=(%9,%10) \n" : - "+m" (*x), + "+m" (*(double (*)[n * 2]) x), "+r" (n), // 1 "+b" (x), // 2 "=wa" (t0), // 3 diff --git a/kernel/power/zscal_microk_power8.c b/kernel/power/zscal_microk_power8.c index 567331775a..7a7f52a2eb 100644 --- a/kernel/power/zscal_microk_power8.c +++ b/kernel/power/zscal_microk_power8.c @@ -205,7 +205,7 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) "#n=%1 x=%0=%2 alpha=(%15,%16) o16=%17 o32=%18 o48=%19 o64=%20 o80=%21 o96=%22 o112=%23\n" "#t0=%x3 t1=%x4 t2=%x5 t3=%x6 t4=%x7 t5=%x8 t6=%x9 t7=%x10 t8=%x11 t9=%x12 t10=%x13 t11=%x14" : - "+m" (*x), + "+m" (*(double (*)[n * 2]) x), "+r" (n), // 1 "+b" (x), // 2 "=wa" (t0), // 3 diff --git a/kernel/power/zswap_microk_power8.c b/kernel/power/zswap_microk_power8.c index 1e9fbe2cff..78e616521d 100644 --- a/kernel/power/zswap_microk_power8.c +++ b/kernel/power/zswap_microk_power8.c @@ -134,8 +134,8 @@ zswap_kernel_16 (long n, double *x, double *y) "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : - "+m" (*x), - "+m" (*y), + "+m" (*(double (*)[n * 2]) x), + "+m" (*(double (*)[n * 2]) y), "+r" (n), // 2 "+b" (x), // 3 "+b" (y) // 4 diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL128B b/kernel/riscv64/KERNEL.RISCV64_ZVL128B index 7fbc26d213..ad7db5622e 100644 --- a/kernel/riscv64/KERNEL.RISCV64_ZVL128B +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL128B @@ -96,27 +96,45 @@ CGEMVTKERNEL = zgemv_t_rvv.c ZGEMVTKERNEL = zgemv_t_rvv.c SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c +ifneq ($(filter $(SGEMM_UNROLL_N),4 8 16),) +SGEMMONCOPY = gemm_ncopy_$(SGEMM_UNROLL_N)_rvv.c +SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv.c +else SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +endif SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifneq ($(filter $(SGEMM_UNROLL_M),4 8 16),) +SGEMMINCOPY = gemm_ncopy_$(SGEMM_UNROLL_M)_rvv.c +SGEMMITCOPY = gemm_tcopy_$(SGEMM_UNROLL_M)_rvv.c +else SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) endif DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +ifneq ($(filter $(DGEMM_UNROLL_N),4 8 16),) +DGEMMOTCOPY = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv.c +else DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +ifneq ($(filter $(DGEMM_UNROLL_M),4 8 16),) +DGEMMITCOPY = gemm_tcopy_$(DGEMM_UNROLL_M)_rvv.c +else DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) endif @@ -245,3 +263,38 @@ endif ifndef ZGEMM_BETA ZGEMM_BETA = zgemm_beta_rvv.c endif + +ifeq ($(BUILD_HFLOAT16), 1) +SHGEMMKERNEL = shgemm_kernel_$(SHGEMM_UNROLL_M)x$(SHGEMM_UNROLL_N)_zvl128b.c +ifneq ($(filter $(SHGEMM_UNROLL_N),8 16),) +SHGEMMONCOPY = gemm_ncopy_$(SHGEMM_UNROLL_N)fp_rvv.c +SHGEMMOTCOPY = gemm_tcopy_$(SHGEMM_UNROLL_N)fp_rvv.c +else +SHGEMMONCOPY = ../generic/gemm_ncopy_$(SHGEMM_UNROLL_N).c +SHGEMMOTCOPY = ../generic/gemm_tcopy_$(SHGEMM_UNROLL_N).c +endif +SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) +SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) +ifndef SHGEMM_BETA +SHGEMM_BETA = gemm_beta_rvv.c +endif +endif + +ifeq ($(BUILD_BFLOAT16), 1) +SBGEMMKERNEL = sbgemm_kernel_$(SBGEMM_UNROLL_M)x$(SBGEMM_UNROLL_N)_zvl128b.c +ifneq ($(filter $(SBGEMM_UNROLL_N),8 16),) +SBGEMMONCOPY = gemm_ncopy_$(SBGEMM_UNROLL_N)fp_rvv.c +SBGEMMOTCOPY = gemm_tcopy_$(SBGEMM_UNROLL_N)fp_rvv.c +else +SBGEMMONCOPY = ../generic/gemm_ncopy_$(SBGEMM_UNROLL_N).c +SBGEMMOTCOPY = ../generic/gemm_tcopy_$(SBGEMM_UNROLL_N).c +endif +SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) +SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) +ifndef SBGEMM_BETA +SBGEMM_BETA = gemm_beta_rvv.c +endif +endif + +DOMATCOPY_CT = omatcopy_ct_rvv.c +SOMATCOPY_CT = omatcopy_ct_rvv.c diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL256B b/kernel/riscv64/KERNEL.RISCV64_ZVL256B index 2b4f0a5455..c48095bb21 100644 --- a/kernel/riscv64/KERNEL.RISCV64_ZVL256B +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL256B @@ -96,25 +96,43 @@ CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c +ifneq ($(filter $(SGEMM_UNROLL_N),4 8 16),) +SGEMMONCOPY = gemm_ncopy_$(SGEMM_UNROLL_N)_rvv.c +SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv.c +else SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +endif SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifneq ($(filter $(SGEMM_UNROLL_M),4 8 16),) +SGEMMINCOPY = gemm_ncopy_$(SGEMM_UNROLL_M)_rvv.c +SGEMMITCOPY = gemm_tcopy_$(SGEMM_UNROLL_M)_rvv.c +else SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) endif DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +ifneq ($(filter $(DGEMM_UNROLL_N),4 8 16),) +DGEMMOTCOPY = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv.c +else DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +ifneq ($(filter $(DGEMM_UNROLL_M),4 8 16),) +DGEMMITCOPY = gemm_tcopy_$(DGEMM_UNROLL_M)_rvv.c +else DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) endif @@ -209,5 +227,67 @@ COMATCOPY_CN = zomatcopy_cn_vector.c DOMATCOPY_CN = omatcopy_cn_vector.c SOMATCOPY_CN = omatcopy_cn_vector.c +DOMATCOPY_CT = omatcopy_ct_rvv.c +SOMATCOPY_CT = omatcopy_ct_rvv.c + + +ifeq ($(BUILD_HFLOAT16), 1) +SHGEMMKERNEL = shgemm_kernel_$(SHGEMM_UNROLL_M)x$(SHGEMM_UNROLL_N)_zvl256b.c +ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) +ifneq ($(filter $(SHGEMM_UNROLL_M),8 16),) +SHGEMMINCOPY = gemm_ncopy_$(SHGEMM_UNROLL_M)fp_rvv.c +SHGEMMITCOPY = gemm_tcopy_$(SHGEMM_UNROLL_M)fp_rvv.c +else +SHGEMMINCOPY = ../generic/gemm_ncopy_$(SHGEMM_UNROLL_M).c +SHGEMMITCOPY = ../generic/gemm_tcopy_$(SHGEMM_UNROLL_M).c +endif +SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) +SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ifneq ($(filter $(SHGEMM_UNROLL_N),8 16),) +SHGEMMONCOPY = gemm_ncopy_$(SHGEMM_UNROLL_N)fp_rvv.c +SHGEMMOTCOPY = gemm_tcopy_$(SHGEMM_UNROLL_N)fp_rvv.c +else +SHGEMMONCOPY = ../generic/gemm_ncopy_$(SHGEMM_UNROLL_N).c +SHGEMMOTCOPY = ../generic/gemm_tcopy_$(SHGEMM_UNROLL_N).c +endif +SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) +SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) +ifndef SHGEMM_BETA +SHGEMM_BETA = gemm_beta_rvv.c +endif +SHGEMVNKERNEL = sbgemv_n_vector.c +SHGEMVTKERNEL = sbgemv_t_vector.c +endif + +ifeq ($(BUILD_BFLOAT16), 1) +SBGEMMKERNEL = sbgemm_kernel_$(SBGEMM_UNROLL_M)x$(SBGEMM_UNROLL_N)_zvl256b.c +ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N)) +ifneq ($(filter $(SBGEMM_UNROLL_M),8 16),) +SBGEMMINCOPY = gemm_ncopy_$(SBGEMM_UNROLL_M)fp_rvv.c +SBGEMMITCOPY = gemm_tcopy_$(SBGEMM_UNROLL_M)fp_rvv.c +else +SBGEMMINCOPY = ../generic/gemm_ncopy_$(SBGEMM_UNROLL_M).c +SBGEMMITCOPY = ../generic/gemm_tcopy_$(SBGEMM_UNROLL_M).c +endif +SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) +SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ifneq ($(filter $(SBGEMM_UNROLL_N),8 16),) +SBGEMMONCOPY = gemm_ncopy_$(SBGEMM_UNROLL_N)fp_rvv.c +SBGEMMOTCOPY = gemm_tcopy_$(SBGEMM_UNROLL_N)fp_rvv.c +else +SBGEMMONCOPY = ../generic/gemm_ncopy_$(SBGEMM_UNROLL_N).c +SBGEMMOTCOPY = ../generic/gemm_tcopy_$(SBGEMM_UNROLL_N).c +endif +SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) +SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) +ifndef SBGEMM_BETA +SBGEMM_BETA = gemm_beta_rvv.c +endif +SBGEMVNKERNEL = sbgemv_n_vector.c +SBGEMVTKERNEL = sbgemv_t_vector.c +endif + SAXPBYKERNEL = axpby_vector_v2.c DAXPBYKERNEL = axpby_vector_v2.c diff --git a/kernel/riscv64/gemm_ncopy_16_rvv.c b/kernel/riscv64/gemm_ncopy_16_rvv.c new file mode 100644 index 0000000000..2138133bdd --- /dev/null +++ b/kernel/riscv64/gemm_ncopy_16_rvv.c @@ -0,0 +1,325 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define FLOAT_VX2_T vfloat32m1x2_t +#define FLOAT_VX4_T vfloat32m1x4_t +#define FLOAT_VX8_T vfloat32m1x8_t +#define VLSEG_FLOAT __riscv_vlse32_v_f32m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1x8 +#define VGET_VX2 __riscv_vget_v_f32m1x2_f32m1 +#define VGET_VX4 __riscv_vget_v_f32m1x4_f32m1 +#define VGET_VX8 __riscv_vget_v_f32m1x8_f32m1 +#define VSET_VX2 __riscv_vset_v_f32m1_f32m1x2 +#define VSET_VX4 __riscv_vset_v_f32m1_f32m1x4 +#define VSET_VX8 __riscv_vset_v_f32m1_f32m1x8 +#define VLEV_FLOAT __riscv_vle32_v_f32m1 +#define VSEV_FLOAT __riscv_vse32_v_f32m1 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 +#else +#define VSETVL(n) __riscv_vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define FLOAT_VX2_T vfloat64m1x2_t +#define FLOAT_VX4_T vfloat64m1x4_t +#define FLOAT_VX8_T vfloat64m1x8_t +#define VLSEG_FLOAT __riscv_vlse64_v_f64m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1x8 +#define VGET_VX2 __riscv_vget_v_f64m1x2_f64m1 +#define VGET_VX4 __riscv_vget_v_f64m1x4_f64m1 +#define VGET_VX8 __riscv_vget_v_f64m1x8_f64m1 +#define VSET_VX2 __riscv_vset_v_f64m1_f64m1x2 +#define VSET_VX4 __riscv_vset_v_f64m1_f64m1x4 +#define VSET_VX8 __riscv_vset_v_f64m1_f64m1x8 +#define VLEV_FLOAT __riscv_vle64_v_f64m1 +#define VSEV_FLOAT __riscv_vse64_v_f64m1 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 +#endif + +// Optimizes the implementation in ../generic/gemm_ncopy_16.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) +{ + BLASLONG i, j; + + FLOAT *a_offset; + FLOAT *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *a_offset5, *a_offset6, *a_offset7, *a_offset8; + FLOAT *b_offset; + + FLOAT_V_T v1, v2, v3, v4, v5, v6, v7, v8; + FLOAT_V_T v9, v10, v11, v12, v13, v14, v15, v16; + FLOAT_VX2_T vx2, vx21; + FLOAT_VX4_T vx4, vx41; + FLOAT_VX8_T vx8, vx81; + + size_t vl; + + //fprintf(stderr, "gemm_ncopy_16 m=%ld n=%ld lda=%ld\n", m, n, lda); + + a_offset = a; + b_offset = b; + + j = (n >> 4); + if (j) { + vl = VSETVL(8); + + do { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda * 8; + a_offset += 16 * lda; + + i = m >> 3; + if (i) { + do { + vx8 = VLSSEG8_FLOAT(a_offset1, lda * sizeof(FLOAT), vl); + vx81 = VLSSEG8_FLOAT(a_offset2, lda * sizeof(FLOAT), vl); + + v1 = VGET_VX8(vx8, 0); + v2 = VGET_VX8(vx8, 1); + v3 = VGET_VX8(vx8, 2); + v4 = VGET_VX8(vx8, 3); + v5 = VGET_VX8(vx8, 4); + v6 = VGET_VX8(vx8, 5); + v7 = VGET_VX8(vx8, 6); + v8 = VGET_VX8(vx8, 7); + v9 = VGET_VX8(vx81, 0); + v10 = VGET_VX8(vx81, 1); + v11 = VGET_VX8(vx81, 2); + v12 = VGET_VX8(vx81, 3); + v13 = VGET_VX8(vx81, 4); + v14 = VGET_VX8(vx81, 5); + v15 = VGET_VX8(vx81, 6); + v16 = VGET_VX8(vx81, 7); + + VSEV_FLOAT(b_offset, v1, vl); + VSEV_FLOAT(b_offset + 8, v9, vl); + VSEV_FLOAT(b_offset + 16, v2, vl); + VSEV_FLOAT(b_offset + 24, v10, vl); + VSEV_FLOAT(b_offset + 32, v3, vl); + VSEV_FLOAT(b_offset + 40, v11, vl); + VSEV_FLOAT(b_offset + 48, v4, vl); + VSEV_FLOAT(b_offset + 56, v12, vl); + VSEV_FLOAT(b_offset + 64, v5, vl); + VSEV_FLOAT(b_offset + 72, v13, vl); + VSEV_FLOAT(b_offset + 80, v6, vl); + VSEV_FLOAT(b_offset + 88, v14, vl); + VSEV_FLOAT(b_offset + 96, v7, vl); + VSEV_FLOAT(b_offset + 104, v15, vl); + VSEV_FLOAT(b_offset + 112, v8, vl); + VSEV_FLOAT(b_offset + 120, v16, vl); + + a_offset1 += 8; + a_offset2 += 8; + b_offset += 128; + } while (--i); + } + + if (m & 4) { + vx4 = VLSSEG4_FLOAT(a_offset1, lda * sizeof(FLOAT), vl); + vx41 = VLSSEG4_FLOAT(a_offset2, lda * sizeof(FLOAT), vl); + + v1 = VGET_VX4(vx4, 0); + v2 = VGET_VX4(vx4, 1); + v3 = VGET_VX4(vx4, 2); + v4 = VGET_VX4(vx4, 3); + v5 = VGET_VX4(vx41, 0); + v6 = VGET_VX4(vx41, 1); + v7 = VGET_VX4(vx41, 2); + v8 = VGET_VX4(vx41, 3); + + VSEV_FLOAT(b_offset, v1, vl); + VSEV_FLOAT(b_offset + 8, v5, vl); + VSEV_FLOAT(b_offset + 16, v2, vl); + VSEV_FLOAT(b_offset + 24, v6, vl); + VSEV_FLOAT(b_offset + 32, v3, vl); + VSEV_FLOAT(b_offset + 40, v7, vl); + VSEV_FLOAT(b_offset + 48, v4, vl); + VSEV_FLOAT(b_offset + 56, v8, vl); + + a_offset1 += 4; + a_offset2 += 4; + b_offset += 64; + } + + if (m & 2) { + vx2 = VLSSEG2_FLOAT(a_offset1, lda * sizeof(FLOAT), vl); + vx21 = VLSSEG2_FLOAT(a_offset2, lda * sizeof(FLOAT), vl); + + v1 = VGET_VX2(vx2, 0); + v2 = VGET_VX2(vx2, 1); + v3 = VGET_VX2(vx21, 0); + v4 = VGET_VX2(vx21, 1); + + VSEV_FLOAT(b_offset, v1, vl); + VSEV_FLOAT(b_offset + 8, v3, vl); + VSEV_FLOAT(b_offset + 16, v2, vl); + VSEV_FLOAT(b_offset + 24, v4, vl); + + a_offset1 += 2; + a_offset2 += 2; + b_offset += 32; + } + + if (m & 1) { + v1 = VLSEG_FLOAT(a_offset1, lda * sizeof(FLOAT), vl); + v2 = VLSEG_FLOAT(a_offset2, lda * sizeof(FLOAT), vl); + + VSEV_FLOAT(b_offset, v1, vl); + VSEV_FLOAT(b_offset + 8, v2, vl); + + b_offset += 16; + } + } while (--j); + } + + if (n & 8) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset5 = a_offset4 + lda; + a_offset6 = a_offset5 + lda; + a_offset7 = a_offset6 + lda; + a_offset8 = a_offset7 + lda; + a_offset += 8 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + v3 = VLEV_FLOAT(a_offset3, vl); + v4 = VLEV_FLOAT(a_offset4, vl); + v5 = VLEV_FLOAT(a_offset5, vl); + v6 = VLEV_FLOAT(a_offset6, vl); + v7 = VLEV_FLOAT(a_offset7, vl); + v8 = VLEV_FLOAT(a_offset8, vl); + + vx8 = VSET_VX8(vx8, 0, v1); + vx8 = VSET_VX8(vx8, 1, v2); + vx8 = VSET_VX8(vx8, 2, v3); + vx8 = VSET_VX8(vx8, 3, v4); + vx8 = VSET_VX8(vx8, 4, v5); + vx8 = VSET_VX8(vx8, 5, v6); + vx8 = VSET_VX8(vx8, 6, v7); + vx8 = VSET_VX8(vx8, 7, v8); + + VSSEG8_FLOAT(b_offset, vx8, vl); + + a_offset1 += vl; + a_offset2 += vl; + a_offset3 += vl; + a_offset4 += vl; + a_offset5 += vl; + a_offset6 += vl; + a_offset7 += vl; + a_offset8 += vl; + b_offset += vl*8; + } + } + + if (n & 4) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + v3 = VLEV_FLOAT(a_offset3, vl); + v4 = VLEV_FLOAT(a_offset4, vl); + + vx4 = VSET_VX4(vx4, 0, v1); + vx4 = VSET_VX4(vx4, 1, v2); + vx4 = VSET_VX4(vx4, 2, v3); + vx4 = VSET_VX4(vx4, 3, v4); + + VSSEG4_FLOAT(b_offset, vx4, vl); + + a_offset1 += vl; + a_offset2 += vl; + a_offset3 += vl; + a_offset4 += vl; + b_offset += vl*4; + } + } + + if (n & 2) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + + vx2 = VSET_VX2(vx2, 0, v1); + vx2 = VSET_VX2(vx2, 1, v2); + + VSSEG2_FLOAT(b_offset, vx2, vl); + + a_offset1 += vl; + a_offset2 += vl; + b_offset += vl*2; + } + } + + if (n & 1) { + a_offset1 = a_offset; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + + VSEV_FLOAT(b_offset, v1, vl); + + a_offset1 += vl; + b_offset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_ncopy_16fp_rvv.c b/kernel/riscv64/gemm_ncopy_16fp_rvv.c new file mode 100644 index 0000000000..c67c5ef206 --- /dev/null +++ b/kernel/riscv64/gemm_ncopy_16fp_rvv.c @@ -0,0 +1,345 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#define VSETVL(n) __riscv_vsetvl_e16m1(n) +#define VSETVL2(n) __riscv_vsetvl_e16m2(n) +#define VSETVL4(n) __riscv_vsetvl_e16m4(n) +#define VSETVL8(n) __riscv_vsetvl_e16m8(n) +#if defined(HFLOAT16) +#define FLOAT_V_T vfloat16m1_t +#define FLOAT_V2_T vfloat16m2_t +#define FLOAT_V4_T vfloat16m4_t +#define FLOAT_V8_T vfloat16m8_t +#define FLOAT_VX2_T vfloat16m1x2_t +#define FLOAT_VX4_T vfloat16m1x4_t +#define FLOAT_VX8_T vfloat16m1x8_t +#define FLOAT_VX24_T vfloat16m4x2_t +#define FLOAT_VX42_T vfloat16m2x4_t +#define VLSEG2_FLOAT __riscv_vlse16_v_f16m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e16_v_f16m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e16_v_f16m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e16_v_f16m1x8 +#define VGET_VX2 __riscv_vget_v_f16m1x2_f16m1 +#define VGET_VX4 __riscv_vget_v_f16m1x4_f16m1 +#define VGET_VX8 __riscv_vget_v_f16m1x8_f16m1 +#define VSET_VX2 __riscv_vset_v_f16m4_f16m4x2 +#define VSET_VX4 __riscv_vset_v_f16m2_f16m2x4 +#define VSET_VX8 __riscv_vset_v_f16m1_f16m1x8 +#define VLEV_FLOAT __riscv_vle16_v_f16m1 +#define VLEV_FLOAT2 __riscv_vle16_v_f16m2 +#define VLEV_FLOAT4 __riscv_vle16_v_f16m4 +#define VLEV_FLOAT8 __riscv_vle16_v_f16m8 +#define VSEV_FLOAT __riscv_vse16_v_f16m1 +#define VSEV_FLOAT2 __riscv_vse16_v_f16m2 +#define VSEV_FLOAT8 __riscv_vse16_v_f16m8 +#define VSSEG2_FLOAT __riscv_vsseg2e16_v_f16m4x2 +#define VSSEG4_FLOAT __riscv_vsseg4e16_v_f16m2x4 +#define VSSEG8_FLOAT __riscv_vsseg8e16_v_f16m1x8 +#else +#define FLOAT_V_T vbfloat16m1_t +#define FLOAT_V2_T vbfloat16m2_t +#define FLOAT_V4_T vbfloat16m4_t +#define FLOAT_V8_T vbfloat16m8_t +#define FLOAT_VX2_T vbfloat16m1x2_t +#define FLOAT_VX4_T vbfloat16m1x4_t +#define FLOAT_VX8_T vbfloat16m1x8_t +#define FLOAT_VX24_T vbfloat16m4x2_t +#define FLOAT_VX42_T vbfloat16m2x4_t +#define VLSEG2_FLOAT __riscv_vlse16_v_bf16m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e16_v_bf16m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e16_v_bf16m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e16_v_bf16m1x8 +#define VGET_VX2 __riscv_vget_v_bf16m1x2_bf16m1 +#define VGET_VX4 __riscv_vget_v_bf16m1x4_bf16m1 +#define VGET_VX8 __riscv_vget_v_bf16m1x8_bf16m1 +#define VSET_VX2 __riscv_vset_v_bf16m4_bf16m4x2 +#define VSET_VX4 __riscv_vset_v_bf16m2_bf16m2x4 +#define VSET_VX8 __riscv_vset_v_bf16m1_bf16m1x8 +#define VLEV_FLOAT __riscv_vle16_v_bf16m1 +#define VLEV_FLOAT2 __riscv_vle16_v_bf16m2 +#define VLEV_FLOAT4 __riscv_vle16_v_bf16m4 +#define VLEV_FLOAT8 __riscv_vle16_v_bf16m8 +#define VSEV_FLOAT __riscv_vse16_v_bf16m1 +#define VSEV_FLOAT2 __riscv_vse16_v_bf16m2 +#define VSEV_FLOAT8 __riscv_vse16_v_bf16m8 +#define VSSEG2_FLOAT __riscv_vsseg2e16_v_bf16m4x2 +#define VSSEG4_FLOAT __riscv_vsseg4e16_v_bf16m2x4 +#define VSSEG8_FLOAT __riscv_vsseg8e16_v_bf16m1x8 +#endif + +// Optimizes the implementation in ../generic/gemm_ncopy_16.c + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + + IFLOAT *a_offset; + IFLOAT *a_offset1, *a_offset2, *a_offset3, *a_offset4; + IFLOAT *a_offset5, *a_offset6, *a_offset7, *a_offset8; + IFLOAT *b_offset; + + FLOAT_V_T v1, v2, v3, v4, v5, v6, v7, v8; + FLOAT_V_T v9, v10, v11, v12, v13, v14, v15, v16; + FLOAT_V2_T v21, v22, v23, v24; + FLOAT_V4_T v41, v42; + FLOAT_V8_T v81; + + FLOAT_VX2_T vx2, vx21; + FLOAT_VX4_T vx4, vx41; + FLOAT_VX8_T vx8, vx81; + FLOAT_VX42_T vx24; + FLOAT_VX24_T vx42; + + size_t vl; + + //fprintf(stderr, "gemm_ncopy_16 m=%ld n=%ld lda=%ld\n", m, n, lda); + + a_offset = a; + b_offset = b; + + for (j = (n >> 4); j > 0; j--) { + vl = VSETVL(8); + + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda * 8; + a_offset += 16 * lda; + + for (i = m >> 3; i > 0; i--) { + vx8 = VLSSEG8_FLOAT(a_offset1, lda * sizeof(IFLOAT), vl); + vx81 = VLSSEG8_FLOAT(a_offset2, lda * sizeof(IFLOAT), vl); + + v1 = VGET_VX8(vx8, 0); + v2 = VGET_VX8(vx8, 1); + v3 = VGET_VX8(vx8, 2); + v4 = VGET_VX8(vx8, 3); + v5 = VGET_VX8(vx8, 4); + v6 = VGET_VX8(vx8, 5); + v7 = VGET_VX8(vx8, 6); + v8 = VGET_VX8(vx8, 7); + v9 = VGET_VX8(vx81, 0); + v10 = VGET_VX8(vx81, 1); + v11 = VGET_VX8(vx81, 2); + v12 = VGET_VX8(vx81, 3); + v13 = VGET_VX8(vx81, 4); + v14 = VGET_VX8(vx81, 5); + v15 = VGET_VX8(vx81, 6); + v16 = VGET_VX8(vx81, 7); + + VSEV_FLOAT(b_offset, v1, vl); + VSEV_FLOAT(b_offset + 8, v9, vl); + VSEV_FLOAT(b_offset + 16, v2, vl); + VSEV_FLOAT(b_offset + 24, v10, vl); + VSEV_FLOAT(b_offset + 32, v3, vl); + VSEV_FLOAT(b_offset + 40, v11, vl); + VSEV_FLOAT(b_offset + 48, v4, vl); + VSEV_FLOAT(b_offset + 56, v12, vl); + VSEV_FLOAT(b_offset + 64, v5, vl); + VSEV_FLOAT(b_offset + 72, v13, vl); + VSEV_FLOAT(b_offset + 80, v6, vl); + VSEV_FLOAT(b_offset + 88, v14, vl); + VSEV_FLOAT(b_offset + 96, v7, vl); + VSEV_FLOAT(b_offset + 104, v15, vl); + VSEV_FLOAT(b_offset + 112, v8, vl); + VSEV_FLOAT(b_offset + 120, v16, vl); + + a_offset1 += 8; + a_offset2 += 8; + b_offset += 128; + } + + if (m & 4) { + vx4 = VLSSEG4_FLOAT(a_offset1, lda * sizeof(IFLOAT), vl); + vx41 = VLSSEG4_FLOAT(a_offset2, lda * sizeof(IFLOAT), vl); + + v1 = VGET_VX4(vx4, 0); + v2 = VGET_VX4(vx4, 1); + v3 = VGET_VX4(vx4, 2); + v4 = VGET_VX4(vx4, 3); + v5 = VGET_VX4(vx41, 0); + v6 = VGET_VX4(vx41, 1); + v7 = VGET_VX4(vx41, 2); + v8 = VGET_VX4(vx41, 3); + + VSEV_FLOAT(b_offset, v1, vl); + VSEV_FLOAT(b_offset + 8, v5, vl); + VSEV_FLOAT(b_offset + 16, v2, vl); + VSEV_FLOAT(b_offset + 24, v6, vl); + VSEV_FLOAT(b_offset + 32, v3, vl); + VSEV_FLOAT(b_offset + 40, v7, vl); + VSEV_FLOAT(b_offset + 48, v4, vl); + VSEV_FLOAT(b_offset + 56, v8, vl); + + a_offset1 += 4; + a_offset2 += 4; + b_offset += 64; + } + + if (m & 2) { + vx2 = VLSSEG2_FLOAT(a_offset1, lda * sizeof(IFLOAT), vl); + vx21 = VLSSEG2_FLOAT(a_offset2, lda * sizeof(IFLOAT), vl); + + v1 = VGET_VX2(vx2, 0); + v2 = VGET_VX2(vx2, 1); + v3 = VGET_VX2(vx21, 0); + v4 = VGET_VX2(vx21, 1); + + VSEV_FLOAT(b_offset, v1, vl); + VSEV_FLOAT(b_offset + 8, v3, vl); + VSEV_FLOAT(b_offset + 16, v2, vl); + VSEV_FLOAT(b_offset + 24, v4, vl); + + a_offset1 += 2; + a_offset2 += 2; + b_offset += 32; + } + + if (m & 1) { + v21 = VLSEG2_FLOAT(a_offset1, lda * sizeof(IFLOAT), vl * 2); + + VSEV_FLOAT2(b_offset, v21, vl * 2); + + b_offset += 16; + } + } + + if (n & 8) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset5 = a_offset4 + lda; + a_offset6 = a_offset5 + lda; + a_offset7 = a_offset6 + lda; + a_offset8 = a_offset7 + lda; + a_offset += 8 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + v3 = VLEV_FLOAT(a_offset3, vl); + v4 = VLEV_FLOAT(a_offset4, vl); + v5 = VLEV_FLOAT(a_offset5, vl); + v6 = VLEV_FLOAT(a_offset6, vl); + v7 = VLEV_FLOAT(a_offset7, vl); + v8 = VLEV_FLOAT(a_offset8, vl); + + vx8 = VSET_VX8(vx8, 0, v1); + vx8 = VSET_VX8(vx8, 1, v2); + vx8 = VSET_VX8(vx8, 2, v3); + vx8 = VSET_VX8(vx8, 3, v4); + vx8 = VSET_VX8(vx8, 4, v5); + vx8 = VSET_VX8(vx8, 5, v6); + vx8 = VSET_VX8(vx8, 6, v7); + vx8 = VSET_VX8(vx8, 7, v8); + + VSSEG8_FLOAT(b_offset, vx8, vl); + + a_offset1 += vl; + a_offset2 += vl; + a_offset3 += vl; + a_offset4 += vl; + a_offset5 += vl; + a_offset6 += vl; + a_offset7 += vl; + a_offset8 += vl; + b_offset += vl*8; + } + } + + if (n & 4) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL2(i); + + v21 = VLEV_FLOAT2(a_offset1, vl); + v22 = VLEV_FLOAT2(a_offset2, vl); + v23 = VLEV_FLOAT2(a_offset3, vl); + v24 = VLEV_FLOAT2(a_offset4, vl); + + vx24 = VSET_VX4(vx24, 0, v21); + vx24 = VSET_VX4(vx24, 1, v22); + vx24 = VSET_VX4(vx24, 2, v23); + vx24 = VSET_VX4(vx24, 3, v24); + + VSSEG4_FLOAT(b_offset, vx24, vl); + + a_offset1 += vl; + a_offset2 += vl; + a_offset3 += vl; + a_offset4 += vl; + b_offset += vl*4; + } + } + + if (n & 2) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL4(i); + + v41 = VLEV_FLOAT4(a_offset1, vl); + v42 = VLEV_FLOAT4(a_offset2, vl); + + vx42 = VSET_VX2(vx42, 0, v41); + vx42 = VSET_VX2(vx42, 1, v42); + + VSSEG2_FLOAT(b_offset, vx42, vl); + + a_offset1 += vl; + a_offset2 += vl; + b_offset += vl*2; + } + } + + if (n & 1) { + a_offset1 = a_offset; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL8(i); + + v81 = VLEV_FLOAT8(a_offset1, vl); + + VSEV_FLOAT8(b_offset, v81, vl); + + a_offset1 += vl; + b_offset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_ncopy_4_rvv.c b/kernel/riscv64/gemm_ncopy_4_rvv.c new file mode 100644 index 0000000000..67c93b6c09 --- /dev/null +++ b/kernel/riscv64/gemm_ncopy_4_rvv.c @@ -0,0 +1,143 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define FLOAT_VX2_T vfloat32m1x2_t +#define FLOAT_VX4_T vfloat32m1x4_t +#define VSET_VX2 __riscv_vset_v_f32m1_f32m1x2 +#define VSET_VX4 __riscv_vset_v_f32m1_f32m1x4 +#define VLEV_FLOAT __riscv_vle32_v_f32m1 +#define VSEV_FLOAT __riscv_vse32_v_f32m1 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 +#else +#define VSETVL(n) __riscv_vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define FLOAT_VX2_T vfloat64m1x2_t +#define FLOAT_VX4_T vfloat64m1x4_t +#define VSET_VX2 __riscv_vset_v_f64m1_f64m1x2 +#define VSET_VX4 __riscv_vset_v_f64m1_f64m1x4 +#define VLEV_FLOAT __riscv_vle64_v_f64m1 +#define VSEV_FLOAT __riscv_vse64_v_f64m1 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 +#endif + +// Optimizes the implementation in ../generic/gemm_ncopy_4.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) +{ + BLASLONG i, j; + + FLOAT *a_offset; + FLOAT *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset; + + FLOAT_V_T v1, v2, v3, v4; + FLOAT_VX2_T vx2; + FLOAT_VX4_T vx4; + + size_t vl; + + //fprintf(stderr, "gemm_ncopy_4 m=%ld n=%ld lda=%ld\n", m, n, lda); + + a_offset = a; + b_offset = b; + + for(j = (n >> 2); j > 0; j--) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + v3 = VLEV_FLOAT(a_offset3, vl); + v4 = VLEV_FLOAT(a_offset4, vl); + + vx4 = VSET_VX4(vx4, 0, v1); + vx4 = VSET_VX4(vx4, 1, v2); + vx4 = VSET_VX4(vx4, 2, v3); + vx4 = VSET_VX4(vx4, 3, v4); + + VSSEG4_FLOAT(b_offset, vx4, vl); + + a_offset1 += vl; + a_offset2 += vl; + a_offset3 += vl; + a_offset4 += vl; + b_offset += vl*4; + } + } + + if (n & 2) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + + vx2 = VSET_VX2(vx2, 0, v1); + vx2 = VSET_VX2(vx2, 1, v2); + + VSSEG2_FLOAT(b_offset, vx2, vl); + + a_offset1 += vl; + a_offset2 += vl; + b_offset += vl*2; + } + } + + if (n & 1) { + a_offset1 = a_offset; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + + VSEV_FLOAT(b_offset, v1, vl); + + a_offset1 += vl; + b_offset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_ncopy_8fp_rvv.c b/kernel/riscv64/gemm_ncopy_8fp_rvv.c new file mode 100644 index 0000000000..1bc90a645d --- /dev/null +++ b/kernel/riscv64/gemm_ncopy_8fp_rvv.c @@ -0,0 +1,215 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#define VSETVL(n) __riscv_vsetvl_e16m1(n) +#define VSETVL2(n) __riscv_vsetvl_e16m2(n) +#define VSETVL4(n) __riscv_vsetvl_e16m4(n) +#define VSETVL8(n) __riscv_vsetvl_e16m8(n) +#if defined(HFLOAT16) +#define FLOAT_V_T vfloat16m1_t +#define FLOAT2_V_T vfloat16m2_t +#define FLOAT4_V_T vfloat16m4_t +#define FLOAT8_V_T vfloat16m8_t +#define FLOAT_VX2_T vfloat16m4x2_t +#define FLOAT_VX4_T vfloat16m2x4_t +#define FLOAT_VX8_T vfloat16m1x8_t +#define VSET_VX2 __riscv_vset_v_f16m4_f16m4x2 +#define VSET_VX4 __riscv_vset_v_f16m2_f16m2x4 +#define VSET_VX8 __riscv_vset_v_f16m1_f16m1x8 +#define VLEV_FLOAT __riscv_vle16_v_f16m1 +#define VLEV_FLOAT2 __riscv_vle16_v_f16m2 +#define VLEV_FLOAT4 __riscv_vle16_v_f16m4 +#define VLEV_FLOAT8 __riscv_vle16_v_f16m8 +#define VSEV_FLOAT8 __riscv_vse16_v_f16m8 +#define VSSEG2_FLOAT __riscv_vsseg2e16_v_f16m4x2 +#define VSSEG4_FLOAT __riscv_vsseg4e16_v_f16m2x4 +#define VSSEG8_FLOAT __riscv_vsseg8e16_v_f16m1x8 +#else +#define FLOAT_V_T vbfloat16m1_t +#define FLOAT2_V_T vbfloat16m2_t +#define FLOAT4_V_T vbfloat16m4_t +#define FLOAT8_V_T vbfloat16m8_t +#define FLOAT_VX2_T vbfloat16m4x2_t +#define FLOAT_VX4_T vbfloat16m2x4_t +#define FLOAT_VX8_T vbfloat16m1x8_t +#define VSET_VX2 __riscv_vset_v_bf16m4_bf16m4x2 +#define VSET_VX4 __riscv_vset_v_bf16m2_bf16m2x4 +#define VSET_VX8 __riscv_vset_v_bf16m1_bf16m1x8 +#define VLEV_FLOAT __riscv_vle16_v_bf16m1 +#define VLEV_FLOAT2 __riscv_vle16_v_bf16m2 +#define VLEV_FLOAT4 __riscv_vle16_v_bf16m4 +#define VLEV_FLOAT8 __riscv_vle16_v_bf16m8 +#define VSEV_FLOAT8 __riscv_vse16_v_bf16m8 +#define VSSEG2_FLOAT __riscv_vsseg2e16_v_bf16m4x2 +#define VSSEG4_FLOAT __riscv_vsseg4e16_v_bf16m2x4 +#define VSSEG8_FLOAT __riscv_vsseg8e16_v_bf16m1x8 +#endif + +// Optimizes the implementation in ../generic/gemm_ncopy_8.c + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + + IFLOAT *a_offset; + IFLOAT *a_offset1, *a_offset2, *a_offset3, *a_offset4; + IFLOAT *a_offset5, *a_offset6, *a_offset7, *a_offset8; + IFLOAT *b_offset; + + FLOAT_V_T v1, v2, v3, v4, v5, v6, v7, v8; + FLOAT2_V_T v12, v22, v32, v42; + FLOAT4_V_T v14, v24; + FLOAT8_V_T v18; + + FLOAT_VX2_T vx2; + FLOAT_VX4_T vx4; + FLOAT_VX8_T vx8; + + size_t vl; + + //fprintf(stderr, "gemm_ncopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); + + a_offset = a; + b_offset = b; + + for(j = (n >> 3); j > 0; j--) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset5 = a_offset4 + lda; + a_offset6 = a_offset5 + lda; + a_offset7 = a_offset6 + lda; + a_offset8 = a_offset7 + lda; + a_offset += 8 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + v3 = VLEV_FLOAT(a_offset3, vl); + v4 = VLEV_FLOAT(a_offset4, vl); + v5 = VLEV_FLOAT(a_offset5, vl); + v6 = VLEV_FLOAT(a_offset6, vl); + v7 = VLEV_FLOAT(a_offset7, vl); + v8 = VLEV_FLOAT(a_offset8, vl); + + vx8 = VSET_VX8(vx8, 0, v1); + vx8 = VSET_VX8(vx8, 1, v2); + vx8 = VSET_VX8(vx8, 2, v3); + vx8 = VSET_VX8(vx8, 3, v4); + vx8 = VSET_VX8(vx8, 4, v5); + vx8 = VSET_VX8(vx8, 5, v6); + vx8 = VSET_VX8(vx8, 6, v7); + vx8 = VSET_VX8(vx8, 7, v8); + + VSSEG8_FLOAT(b_offset, vx8, vl); + + a_offset1 += vl; + a_offset2 += vl; + a_offset3 += vl; + a_offset4 += vl; + a_offset5 += vl; + a_offset6 += vl; + a_offset7 += vl; + a_offset8 += vl; + b_offset += vl*8; + } + } + + if (n & 4) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL2(i); + + v12 = VLEV_FLOAT2(a_offset1, vl); + v22 = VLEV_FLOAT2(a_offset2, vl); + v32 = VLEV_FLOAT2(a_offset3, vl); + v42 = VLEV_FLOAT2(a_offset4, vl); + + vx4 = VSET_VX4(vx4, 0, v12); + vx4 = VSET_VX4(vx4, 1, v22); + vx4 = VSET_VX4(vx4, 2, v32); + vx4 = VSET_VX4(vx4, 3, v42); + + VSSEG4_FLOAT(b_offset, vx4, vl); + + a_offset1 += vl; + a_offset2 += vl; + a_offset3 += vl; + a_offset4 += vl; + b_offset += vl*4; + } + } + + if (n & 2) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL4(i); + + v14 = VLEV_FLOAT4(a_offset1, vl); + v24 = VLEV_FLOAT4(a_offset2, vl); + + vx2 = VSET_VX2(vx2, 0, v14); + vx2 = VSET_VX2(vx2, 1, v24); + + VSSEG2_FLOAT(b_offset, vx2, vl); + + a_offset1 += vl; + a_offset2 += vl; + b_offset += vl*2; + } + } + + if (n & 1) { + a_offset1 = a_offset; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL8(i); + + v18 = VLEV_FLOAT8(a_offset1, vl); + + VSEV_FLOAT8(b_offset, v18, vl); + + a_offset1 += vl; + b_offset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_tcopy_16_rvv.c b/kernel/riscv64/gemm_tcopy_16_rvv.c new file mode 100644 index 0000000000..4196e85722 --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_16_rvv.c @@ -0,0 +1,129 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_HALF vfloat32m2_t +#define FLOAT_V_T_QUARTER vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLEV_FLOAT_HALF __riscv_vle32_v_f32m2 +#define VLEV_FLOAT_QUARTER __riscv_vle32_v_f32m1 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#define VSEV_FLOAT_HALF __riscv_vse32_v_f32m2 +#define VSEV_FLOAT_QUARTER __riscv_vse32_v_f32m1 +#else +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_HALF vfloat64m4_t +#define FLOAT_V_T_QUARTER vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLEV_FLOAT_HALF __riscv_vle64_v_f64m4 +#define VLEV_FLOAT_QUARTER __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSEV_FLOAT_HALF __riscv_vse64_v_f64m4 +#define VSEV_FLOAT_QUARTER __riscv_vse64_v_f64m2 +#endif + +// Optimizes the implementation in ../generic/gemm_tcopy_16.c + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1; + + IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4, *boffset5; + + FLOAT_V_T v0; + FLOAT_V_T_HALF v1; + FLOAT_V_T_QUARTER v2; + + // fprintf(stderr, "gemm_tcopy_16 m=%ld n=%ld lda=%ld\n", m, n, lda); + + aoffset = a; + boffset = b; + boffset2 = b + m * (n & ~15); + boffset3 = b + m * (n & ~7); + boffset4 = b + m * (n & ~3); + boffset5 = b + m * (n & ~1); + + for(j = m; j > 0; j--) { + aoffset1 = aoffset; + boffset1 = boffset; + + aoffset += lda; + boffset += 16; + + for(i = (n >> 4); i > 0; i--) { + size_t vl = 16; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset1, v0, vl); + + aoffset1 += 16; + boffset1 += 16 * m; + } + + if (n & 8) { + size_t vl = 8; + + v1 = VLEV_FLOAT_HALF(aoffset1, vl); + VSEV_FLOAT_HALF(boffset2, v1, vl); + + aoffset1 += 8; + boffset2 += 8; + } + + if (n & 4) { + size_t vl = 4; + + v2 = VLEV_FLOAT_QUARTER(aoffset1, vl); + VSEV_FLOAT_QUARTER(boffset3, v2, vl); + + aoffset1 += 4; + boffset3 += 4; + } + + if (n & 2) { + *(boffset4) = *(aoffset1); + *(boffset4 + 1) = *(aoffset1 + 1); + + aoffset1 += 2; + boffset4 += 2; + } + + if (n & 1) { + *(boffset5) = *(aoffset1); + aoffset1 ++; + boffset5 ++; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_tcopy_16fp_rvv.c b/kernel/riscv64/gemm_tcopy_16fp_rvv.c new file mode 100644 index 0000000000..110ccd6900 --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_16fp_rvv.c @@ -0,0 +1,129 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(HFLOAT16) +#define FLOAT_V_T vfloat16m2_t +#define FLOAT_V_T_HALF vfloat16m1_t +#define FLOAT_V_T_QUARTER vfloat16m1_t +#define VLEV_FLOAT __riscv_vle16_v_f16m2 +#define VLEV_FLOAT_HALF __riscv_vle16_v_f16m1 +#define VLEV_FLOAT_QUARTER __riscv_vle16_v_f16m1 +#define VSEV_FLOAT __riscv_vse16_v_f16m2 +#define VSEV_FLOAT_HALF __riscv_vse16_v_f16m1 +#define VSEV_FLOAT_QUARTER __riscv_vse16_v_f16m1 +#else +#define FLOAT_V_T vbfloat16m2_t +#define FLOAT_V_T_HALF vbfloat16m1_t +#define FLOAT_V_T_QUARTER vbfloat16m1_t +#define VLEV_FLOAT __riscv_vle16_v_bf16m2 +#define VLEV_FLOAT_HALF __riscv_vle16_v_bf16m1 +#define VLEV_FLOAT_QUARTER __riscv_vle16_v_bf16m1 +#define VSEV_FLOAT __riscv_vse16_v_bf16m2 +#define VSEV_FLOAT_HALF __riscv_vse16_v_bf16m1 +#define VSEV_FLOAT_QUARTER __riscv_vse16_v_bf16m1 +#endif + +// Optimizes the implementation in ../generic/gemm_tcopy_16.c + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1; + + IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4, *boffset5; + + FLOAT_V_T v0; + FLOAT_V_T_HALF v1; + FLOAT_V_T_QUARTER v2; + + // fprintf(stderr, "gemm_tcopy_16 m=%ld n=%ld lda=%ld\n", m, n, lda); + + aoffset = a; + boffset = b; + boffset2 = b + m * (n & ~15); + boffset3 = b + m * (n & ~7); + boffset4 = b + m * (n & ~3); + boffset5 = b + m * (n & ~1); + + for(j = m; j > 0; j--) { + aoffset1 = aoffset; + boffset1 = boffset; + + aoffset += lda; + boffset += 16; + + for(i = (n >> 4); i > 0; i--) { + size_t vl = 16; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset1, v0, vl); + + aoffset1 += 16; + boffset1 += 16 * m; + } + + if (n & 8) { + size_t vl = 8; + + v1 = VLEV_FLOAT_HALF(aoffset1, vl); + VSEV_FLOAT_HALF(boffset2, v1, vl); + + aoffset1 += 8; + boffset2 += 8; + } + + if (n & 4) { + size_t vl = 4; + + v2 = VLEV_FLOAT_QUARTER(aoffset1, vl); + VSEV_FLOAT_QUARTER(boffset3, v2, vl); + + aoffset1 += 4; + boffset3 += 4; + } + + if (n & 2) { + *(boffset4) = *(aoffset1); + *(boffset4 + 1) = *(aoffset1 + 1); + + aoffset1 += 2; + boffset4 += 2; + } + + if (n & 1) { + *(boffset5) = *(aoffset1); + aoffset1 ++; + boffset5 ++; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_tcopy_4_rvv.c b/kernel/riscv64/gemm_tcopy_4_rvv.c new file mode 100644 index 0000000000..b1e6a38652 --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_4_rvv.c @@ -0,0 +1,91 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define FLOAT_V_T vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m1 +#define VSEV_FLOAT __riscv_vse32_v_f32m1 +#else +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1; + + IFLOAT *boffset, *boffset1, *boffset2, *boffset3; + + FLOAT_V_T v0; + + // fprintf(stderr, "gemm_tcopy_4 m=%ld n=%ld lda=%ld\n", m, n, lda); + + aoffset = a; + boffset = b; + boffset2 = b + m * (n & ~3); + boffset3 = b + m * (n & ~1); + + for(j = m; j > 0; j--) { + aoffset1 = aoffset; + boffset1 = boffset; + + aoffset += lda; + boffset += 4; + + for(i = (n >> 2); i > 0; i--) { + size_t vl = 4; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset1, v0, vl); + + aoffset1 += 4; + boffset1 += 4 * m; + } + + if (n & 2) { + *(boffset2) = *(aoffset1); + *(boffset2 + 1) = *(aoffset1 + 1); + + aoffset1 += 2; + boffset2 += 2; + } + + if (n & 1) { + *(boffset3) = *(aoffset1); + aoffset1 ++; + boffset3 ++; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_tcopy_8fp_rvv.c b/kernel/riscv64/gemm_tcopy_8fp_rvv.c new file mode 100644 index 0000000000..3bca2d3238 --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_8fp_rvv.c @@ -0,0 +1,109 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(HFLOAT16) +#define FLOAT_V_T vfloat16m1_t +#define FLOAT_V_T_HALF vfloat16m1_t +#define VLEV_FLOAT __riscv_vle16_v_f16m1 +#define VLEV_FLOAT_HALF __riscv_vle16_v_f16m1 +#define VSEV_FLOAT __riscv_vse16_v_f16m1 +#define VSEV_FLOAT_HALF __riscv_vse16_v_f16m1 +#else +#define FLOAT_V_T vbfloat16m1_t +#define FLOAT_V_T_HALF vbfloat16m1_t +#define VLEV_FLOAT __riscv_vle16_v_bf16m1 +#define VLEV_FLOAT_HALF __riscv_vle16_v_bf16m1 +#define VSEV_FLOAT __riscv_vse16_v_bf16m1 +#define VSEV_FLOAT_HALF __riscv_vse16_v_bf16m1 +#endif + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1; + + IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; + + FLOAT_V_T v0; + FLOAT_V_T_HALF v1; + + // fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); + + aoffset = a; + boffset = b; + boffset2 = b + m * (n & ~7); + boffset3 = b + m * (n & ~3); + boffset4 = b + m * (n & ~1); + + for(j = m; j > 0; j--) { + aoffset1 = aoffset; + boffset1 = boffset; + + aoffset += lda; + boffset += 8; + + for(i = (n >> 3); i > 0; i--) { + size_t vl = 8; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset1, v0, vl); + + aoffset1 += 8; + boffset1 += 8 * m; + } + + if (n & 4) { + size_t vl = 4; + + v1 = VLEV_FLOAT_HALF(aoffset1, vl); + VSEV_FLOAT_HALF(boffset2, v1, vl); + + aoffset1 += 4; + boffset2 += 4; + } + + if (n & 2) { + *(boffset3) = *(aoffset1); + *(boffset3 + 1) = *(aoffset1 + 1); + + aoffset1 += 2; + boffset3 += 2; + } + + if (n & 1) { + *(boffset4) = *(aoffset1); + aoffset1 ++; + boffset4 ++; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemv_n_vector.c b/kernel/riscv64/gemv_n_vector.c index 64ed532cbe..48eb4e2d68 100644 --- a/kernel/riscv64/gemv_n_vector.c +++ b/kernel/riscv64/gemv_n_vector.c @@ -26,229 +26,68 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" + #if !defined(DOUBLE) -#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) -#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) -#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m8) -#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m8) -#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m8) -#define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f32m8) -#define VFILL_ZERO_FLOAT RISCV_RVV(vfsub_vv_f32m8) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m8) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m8) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m8) #else -#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) -#define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) -#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) -#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) -#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) -#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) -#define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f64m4) -#define VFILL_ZERO_FLOAT RISCV_RVV(vfsub_vv_f64m4) +#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m8) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m8) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m8) #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i = 0, j = 0, k = 0; - BLASLONG ix = 0, iy = 0; - - if(n < 0) return(0); - FLOAT *a_ptr = a; - FLOAT temp[4]; - FLOAT_V_T va0, va1, vy0, vy1,vy0_temp, vy1_temp , temp_v ,va0_0 , va0_1 , va1_0 ,va1_1 ,va2_0 ,va2_1 ,va3_0 ,va3_1 ; - unsigned int gvl = 0; - if(inc_y == 1 && inc_x == 1){ - gvl = VSETVL(m); - if(gvl <= m/2){ - for(k=0,j=0; k 0; i -= vl) { + vl = VSETVL(i); + vy = VLEV_FLOAT(y_ptr, vl); + va = VLEV_FLOAT(a_ptr, vl); + vy = VFMACCVF_FLOAT(vy, temp, va, vl); + VSEV_FLOAT(y_ptr, vy, vl); + y_ptr += vl; + a_ptr += vl; } - VSEV_FLOAT(&y[j], vy0, gvl); - j += gvl; + x += inc_x; + a += lda; } - }else{ + } else { BLASLONG stride_y = inc_y * sizeof(FLOAT); - gvl = VSETVL(m); - if(gvl <= m/2){ - BLASLONG inc_yv = inc_y * gvl; - for(k=0,j=0; k 0; i -= vl) { + vl = VSETVL(i); + vy = VLSEV_FLOAT(y_ptr, stride_y, vl); + va = VLEV_FLOAT(a_ptr, vl); + vy = VFMACCVF_FLOAT(vy, temp, va, vl); + VSSEV_FLOAT(y_ptr, stride_y, vy, vl); + y_ptr += vl * inc_y; + a_ptr += vl; } - VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl); - j += gvl; + x += inc_x; + a += lda; } } return(0); -} \ No newline at end of file +} diff --git a/kernel/riscv64/gemv_t_vector.c b/kernel/riscv64/gemv_t_vector.c index 62b85164cb..1bdd57b6d1 100644 --- a/kernel/riscv64/gemv_t_vector.c +++ b/kernel/riscv64/gemv_t_vector.c @@ -27,110 +27,107 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) RISCV_RVV(vsetvl_e32m2)(n) -#define FLOAT_V_T vfloat32m2_t +#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) +#define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m2) -#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m2) +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) #ifdef RISCV_0p10_INTRINSICS -#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m2_f32m1(v_res, va, vb, gvl) +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m8_f32m1(v_res, va, vb, gvl) #else -#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m2_f32m1) +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m8_f32m1) #endif -#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m2) -#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m2) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m8) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) #define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) -#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m2) #define xint_t int #else -#define VSETVL(n) RISCV_RVV(vsetvl_e64m2)(n) -#define FLOAT_V_T vfloat64m2_t +#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) +#define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m2) -#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m2) +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8) #ifdef RISCV_0p10_INTRINSICS -#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m2_f64m1(v_res, va, vb, gvl) +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m8_f64m1(v_res, va, vb, gvl) #else -#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m2_f64m1) +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m8_f64m1) #endif -#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m2) -#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m2) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m8) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8) #define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) -#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m2) #define xint_t long long #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i = 0, j = 0, k = 0; - BLASLONG ix = 0, iy = 0; - FLOAT *a_ptr = a; - FLOAT temp; + BLASLONG i = 0, j = 0, k = 0; + BLASLONG ix = 0, iy = 0; + FLOAT *a_ptr = a; + FLOAT temp; - FLOAT_V_T va, vr, vx; - unsigned int gvl = 0; - FLOAT_V_T_M1 v_res; + FLOAT_V_T va, vr, vx; + unsigned int gvl = 0; + FLOAT_V_T_M1 v_res; + if(inc_x == 1){ - if(inc_x == 1){ - for(i = 0; i < n; i++){ - v_res = VFMVVF_FLOAT_M1(0, 1); - gvl = VSETVL(m); - j = 0; - vr = VFMVVF_FLOAT(0, gvl); - for(k = 0; k < m/gvl; k++){ - va = VLEV_FLOAT(&a_ptr[j], gvl); - vx = VLEV_FLOAT(&x[j], gvl); - vr = VFMULVV_FLOAT(va, vx, gvl); // could vfmacc here and reduce outside loop - v_res = VFREDSUM_FLOAT(vr, v_res, gvl); // but that reordering diverges far enough from scalar path to make tests fail - j += gvl; - } - if(j < m){ - gvl = VSETVL(m-j); - va = VLEV_FLOAT(&a_ptr[j], gvl); - vx = VLEV_FLOAT(&x[j], gvl); - vr = VFMULVV_FLOAT(va, vx, gvl); - v_res = VFREDSUM_FLOAT(vr, v_res, gvl); - } - temp = (FLOAT)EXTRACT_FLOAT(v_res); - y[iy] += alpha * temp; + for(i = 0; i < n; i++){ + v_res = VFMVVF_FLOAT_M1(0, 1); + gvl = VSETVL(m); + j = 0; + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < m/gvl; k++){ + va = VLEV_FLOAT(&a_ptr[j], gvl); + vx = VLEV_FLOAT(&x[j], gvl); + vr = VFMULVV_FLOAT(va, vx, gvl); // could vfmacc here and reduce outside loop + v_res = VFREDSUM_FLOAT(vr, v_res, gvl); // but that reordering diverges far enough from scalar path to make tests fail + j += gvl; + } + if(j < m){ + gvl = VSETVL(m-j); + va = VLEV_FLOAT(&a_ptr[j], gvl); + vx = VLEV_FLOAT(&x[j], gvl); + vr = VFMULVV_FLOAT(va, vx, gvl); + v_res = VFREDSUM_FLOAT(vr, v_res, gvl); + } + temp = (FLOAT)EXTRACT_FLOAT(v_res); + y[iy] += alpha * temp; - iy += inc_y; - a_ptr += lda; + iy += inc_y; + a_ptr += lda; + } + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + for(i = 0; i < n; i++){ + v_res = VFMVVF_FLOAT_M1(0, 1); + gvl = VSETVL(m); + j = 0; + ix = 0; + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < m/gvl; k++){ + va = VLEV_FLOAT(&a_ptr[j], gvl); + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMULVV_FLOAT(va, vx, gvl); + v_res = VFREDSUM_FLOAT(vr, v_res, gvl); + j += gvl; + ix += inc_x * gvl; + } + if(j < m){ + gvl = VSETVL(m-j); + va = VLEV_FLOAT(&a_ptr[j], gvl); + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMULVV_FLOAT(va, vx, gvl); + v_res = VFREDSUM_FLOAT(vr, v_res, gvl); } - }else{ - BLASLONG stride_x = inc_x * sizeof(FLOAT); - for(i = 0; i < n; i++){ - v_res = VFMVVF_FLOAT_M1(0, 1); - gvl = VSETVL(m); - j = 0; - ix = 0; - vr = VFMVVF_FLOAT(0, gvl); - for(k = 0; k < m/gvl; k++){ - va = VLEV_FLOAT(&a_ptr[j], gvl); - vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); - vr = VFMULVV_FLOAT(va, vx, gvl); - v_res = VFREDSUM_FLOAT(vr, v_res, gvl); - j += gvl; - ix += inc_x * gvl; - } - if(j < m){ - gvl = VSETVL(m-j); - va = VLEV_FLOAT(&a_ptr[j], gvl); - vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); - vr = VFMULVV_FLOAT(va, vx, gvl); - v_res = VFREDSUM_FLOAT(vr, v_res, gvl); - } - temp = (FLOAT)EXTRACT_FLOAT(v_res); - y[iy] += alpha * temp; + temp = (FLOAT)EXTRACT_FLOAT(v_res); + y[iy] += alpha * temp; - iy += inc_y; - a_ptr += lda; - } + iy += inc_y; + a_ptr += lda; + } } - - return(0); + return (0); } diff --git a/kernel/riscv64/omatcopy_cn_vector.c b/kernel/riscv64/omatcopy_cn_vector.c index d079310b8e..ee31c54814 100644 --- a/kernel/riscv64/omatcopy_cn_vector.c +++ b/kernel/riscv64/omatcopy_cn_vector.c @@ -51,7 +51,7 @@ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLO FLOAT *aptr,*bptr; size_t vl; - FLOAT_V_T va, vb,va1,vb1; + FLOAT_V_T va,va1; if ( rows <= 0 ) return(0); if ( cols <= 0 ) return(0); diff --git a/kernel/riscv64/omatcopy_ct_rvv.c b/kernel/riscv64/omatcopy_ct_rvv.c new file mode 100644 index 0000000000..85afb5c0dd --- /dev/null +++ b/kernel/riscv64/omatcopy_ct_rvv.c @@ -0,0 +1,114 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +AREDISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#else +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#endif + +/***************************************************** + * Order ColMajor + * Trans with RVV optimization + ******************************************************/ + +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) +{ + BLASLONG i, j; + FLOAT *aptr, *bptr; + size_t vl; + + FLOAT_V_T va; + if (rows <= 0) return(0); + if (cols <= 0) return(0); + + aptr = a; + + if (alpha == 0.0) + { + vl = VSETVL_MAX; + va = VFMVVF_FLOAT(0, vl); + for (i = 0; i < cols; i++) + { + bptr = &b[i]; + for (j = 0; j < rows; j += vl) + { + vl = VSETVL(rows - j); + VSSEV_FLOAT(bptr + j * ldb, sizeof(FLOAT) * ldb, va, vl); + } + } + return(0); + } + + if (alpha == 1.0) + { + for (i = 0; i < cols; i++) + { + bptr = &b[i]; + for (j = 0; j < rows; j += vl) + { + vl = VSETVL(rows - j); + va = VLEV_FLOAT(aptr + j, vl); + VSSEV_FLOAT(bptr + j * ldb, sizeof(FLOAT) * ldb, va, vl); + } + aptr += lda; + } + return(0); + } + + for (i = 0; i < cols; i++) + { + bptr = &b[i]; + for (j = 0; j < rows; j += vl) + { + vl = VSETVL(rows - j); + va = VLEV_FLOAT(aptr + j, vl); + va = VFMULVF_FLOAT(va, alpha, vl); + VSSEV_FLOAT(bptr + j * ldb, sizeof(FLOAT) * ldb, va, vl); + } + aptr += lda; + } + + return(0); +} \ No newline at end of file diff --git a/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c b/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c new file mode 100644 index 0000000000..6e7b06884d --- /dev/null +++ b/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c @@ -0,0 +1,853 @@ +#include "common.h" +#include + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, BLASLONG ldc) +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + __bf16 *BB = (__bf16 *)(B); + __bf16 *AA = (__bf16 *)(A); + + // -- MAIN PASS + for (BLASLONG j=0; j + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, BLASLONG ldc) +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + __bf16 *BB = (__bf16 *)(B); + __bf16 *AA = (__bf16 *)(A); + + // -- MAIN PASS + for (BLASLONG j=0; j 0; i -= vl) { + vl = VSETVL(i); + vy = VFMVVF_FLOAT(0.0, vl); + VSEV_FLOAT(y_ptr, vy, vl); + y_ptr += vl; + } + } else if (beta != 1.0) { + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + vy = VLEV_FLOAT(y_ptr, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + VSEV_FLOAT(y_ptr, vy, vl); + y_ptr += vl; + } + } + for (j = 0; j < n; j++) { +#if defined(HFLOAT16) + temp = (_Float16)(alpha * (FLOAT)(x_ptr[0])); + a_ptr = (_Float16 *)(a); +#else + temp = (__bf16)(alpha * (FLOAT)(x_ptr[0])); + a_ptr = (__bf16 *)(a); +#endif + y_ptr = y; + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + vy = VLEV_FLOAT(y_ptr, vl); + va = VLEV_IFLOAT(a_ptr, vl); + vy = VFMACCVF_FLOAT(vy, temp, va, vl); + VSEV_FLOAT(y_ptr, vy, vl); + y_ptr += vl; + a_ptr += vl; + } + x_ptr += inc_x; + a += lda; + } + } else { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + if (beta == 0.0) { + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + vy = VFMVVF_FLOAT(0.0, vl); + VSSEV_FLOAT(y_ptr, stride_y, vy, vl); + y_ptr += vl * inc_y; + } + } else if (beta != 1.0) { + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + vy = VLSEV_FLOAT(y_ptr, stride_y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + VSSEV_FLOAT(y_ptr, stride_y, vy, vl); + y_ptr += vl * inc_y; + } + } + for (j = 0; j < n; j++) { +#if defined(HFLOAT16) + temp = (_Float16)(alpha * (FLOAT)(x_ptr[0])); + a_ptr = (_Float16 *)(a); +#else + temp = (__bf16)(alpha * (FLOAT)(x_ptr[0])); + a_ptr = (__bf16 *)(a); +#endif + y_ptr = y; + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + vy = VLSEV_FLOAT(y_ptr, stride_y, vl); + va = VLEV_IFLOAT(a_ptr, vl); + vy = VFMACCVF_FLOAT(vy, temp, va, vl); + VSSEV_FLOAT(y_ptr, stride_y, vy, vl); + y_ptr += vl * inc_y; + a_ptr += vl; + } + x_ptr += inc_x; + a += lda; + } + } + return(0); +} diff --git a/kernel/riscv64/sbgemv_t_vector.c b/kernel/riscv64/sbgemv_t_vector.c new file mode 100644 index 0000000000..136a1f7c1f --- /dev/null +++ b/kernel/riscv64/sbgemv_t_vector.c @@ -0,0 +1,140 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) + +#define VSETVL(n) RISCV_RVV(vsetvl_e16m4)(n) + +#if defined(HFLOAT16) +#define IFLOAT_V_T vfloat16m4_t +#define VLEV_IFLOAT RISCV_RVV(vle16_v_f16m4) +#define VLSEV_IFLOAT RISCV_RVV(vlse16_v_f16m4) +#define VFMACCVV_FLOAT(a,b,c,d) RISCV_RVV(vfwmul_vv_f32m8)(b,c,d) +#else +#define IFLOAT_V_T vbfloat16m4_t +#define VLEV_IFLOAT RISCV_RVV(vle16_v_bf16m4) +#define VLSEV_IFLOAT RISCV_RVV(vlse16_v_bf16m4) +#define VFMACCVV_FLOAT RISCV_RVV(vfwmaccbf16_vv_f32m8) +#endif + +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m8_f32m1(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m8_f32m1) +#endif +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) + +int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i = 0, j = 0, k = 0; + BLASLONG ix = 0, iy = 0; +#if defined(HFLOAT16) + _Float16 *a_ptr = (_Float16 *)(a); + _Float16 *x_ptr = (_Float16 *)(x); +#else + __bf16 *a_ptr = (__bf16 *)(a); + __bf16 *x_ptr = (__bf16 *)(x); +#endif + FLOAT temp; + + IFLOAT_V_T va, vx; +#if !defined(HFLOAT16) + FLOAT_V_T vz; +#endif + FLOAT_V_T vr; + BLASLONG gvl = 0; + FLOAT_V_T_M1 v_res; + + if (inc_x == 1) { + for (i = 0; i < n; i++) { + v_res = VFMVVF_FLOAT_M1(0, 1); + gvl = VSETVL(m); + j = 0; +#if !defined(HFLOAT16) + vz = VFMVVF_FLOAT(0, gvl); +#endif + for (k = 0; k < m/gvl; k++) { + va = VLEV_IFLOAT(&a_ptr[j], gvl); + vx = VLEV_IFLOAT(&x_ptr[j], gvl); + vr = VFMACCVV_FLOAT(vz, va, vx, gvl); // could vfmacc here and reduce outside loop + v_res = VFREDSUM_FLOAT(vr, v_res, gvl); // but that reordering diverges far enough from scalar path to make tests fail + j += gvl; + } + if (j < m) { + gvl = VSETVL(m-j); + va = VLEV_IFLOAT(&a_ptr[j], gvl); + vx = VLEV_IFLOAT(&x_ptr[j], gvl); + vr = VFMACCVV_FLOAT(vz, va, vx, gvl); + v_res = VFREDSUM_FLOAT(vr, v_res, gvl); + } + temp = (FLOAT)EXTRACT_FLOAT(v_res); + y[iy] = y[iy] * beta + alpha * temp; + + iy += inc_y; + a_ptr += lda; + } + } else { + BLASLONG stride_x = inc_x * sizeof(IFLOAT); + for (i = 0; i < n; i++) { + v_res = VFMVVF_FLOAT_M1(0, 1); + gvl = VSETVL(m); + j = 0; + ix = 0; +#if !defined(HFLOAT16) + vz = VFMVVF_FLOAT(0, gvl); +#endif + for (k = 0; k < m/gvl; k++) { + va = VLEV_IFLOAT(&a_ptr[j], gvl); + vx = VLSEV_IFLOAT(&x_ptr[ix], stride_x, gvl); + vr = VFMACCVV_FLOAT(vz, va, vx, gvl); + v_res = VFREDSUM_FLOAT(vr, v_res, gvl); + j += gvl; + ix += inc_x * gvl; + } + if (j < m) { + gvl = VSETVL(m-j); + va = VLEV_IFLOAT(&a_ptr[j], gvl); + vx = VLSEV_IFLOAT(&x_ptr[ix], stride_x, gvl); + vr = VFMACCVV_FLOAT(vz, va, vx, gvl); + v_res = VFREDSUM_FLOAT(vr, v_res, gvl); + } + temp = (FLOAT)EXTRACT_FLOAT(v_res); + y[iy] = y[iy] * beta + alpha * temp; + + iy += inc_y; + a_ptr += lda; + } + } + + return (0); +} diff --git a/kernel/riscv64/shgemm_kernel_16x8_zvl256b.c b/kernel/riscv64/shgemm_kernel_16x8_zvl256b.c new file mode 100644 index 0000000000..746e07d55f --- /dev/null +++ b/kernel/riscv64/shgemm_kernel_16x8_zvl256b.c @@ -0,0 +1,969 @@ + +#include "common.h" +#include +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, BLASLONG ldc) +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + for (BLASLONG j=0; j + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, BLASLONG ldc) +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + for (BLASLONG j=0; j 0; n -= vl, x += vl*2, y += vl*2) { + n *= 2; + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { vl = VSETVL(n); - vxx2 = VLSEG_FLOAT(x, vl); - vyx2 = VLSEG_FLOAT(y, vl); - - vx0 = VGET_VX2(vxx2, 0); - vx1 = VGET_VX2(vxx2, 1); - vy0 = VGET_VX2(vyx2, 0); - vy1 = VGET_VX2(vyx2, 1); - + vx0 = VLEV_FLOAT(x, vl); + vy0 = VLEV_FLOAT(y, vl); vt0 = VFMULVF_FLOAT(vx0, c, vl); - vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); - vt1 = VFMULVF_FLOAT(vx1, c, vl); - vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl); - vy0 = VFMULVF_FLOAT(vy0, c, vl); - vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl); - vy1 = VFMULVF_FLOAT(vy1, c, vl); - vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); + vx1 = VFMACCVF_FLOAT(vt0, s, vy0, vl); - vtx2 = VSET_VX2(vtx2, 0, vt0); - vtx2 = VSET_VX2(vtx2, 1, vt1); - vyx2 = VSET_VX2(vyx2, 0, vy0); - vyx2 = VSET_VX2(vyx2, 1, vy1); - - VSSEG_FLOAT(x, vtx2, vl); - VSSEG_FLOAT(y, vyx2, vl); + vt1 = VFMULVF_FLOAT(vy0, c, vl); + vy1 = VFNMSACVF_FLOAT(vt1, s, vx0, vl); + VSEV_FLOAT(x, vx1, vl); + VSEV_FLOAT(y, vy1, vl); } } else if (inc_x == 1){ diff --git a/kernel/riscv64/zsum_vector.c b/kernel/riscv64/zsum_vector.c index ca0b02b5c2..4d0594e1aa 100644 --- a/kernel/riscv64/zsum_vector.c +++ b/kernel/riscv64/zsum_vector.c @@ -71,14 +71,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT asumf=0.0; if (n <= 0 || inc_x <= 0) return(asumf); unsigned int gvl = 0; - FLOAT_V_T v0, v1, v_zero,v_sum; + FLOAT_V_T v0, v1,v_sum; FLOAT_V_T_M1 v_res; v_res = VFMVVF_FLOAT_M1(0, 1); if(inc_x == 1){ BLASLONG n2 = n * 2; gvl = VSETVL(n2); - v_zero = VFMVVF_FLOAT(0, gvl); if(gvl <= n2/2){ v_sum = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer ab_dim1, ab_offset, c_dim1, c_offset, pt_dim1, pt_offset, q_dim1, q_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; - complex q__1, q__2, q__3; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}; /* Local variables */ integer inca; diff --git a/lapack-netlib/SRC/cgbcon.c b/lapack-netlib/SRC/cgbcon.c index 1945c221fa..38a3f73f32 100644 --- a/lapack-netlib/SRC/cgbcon.c +++ b/lapack-netlib/SRC/cgbcon.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer ab_dim1, ab_offset, i__1, i__2, i__3; real r__1, r__2; - complex q__1, q__2; + complex q__1={0.,0.}, q__2={0.,0.}; /* Local variables */ integer kase, kase1, j; diff --git a/lapack-netlib/SRC/cgbequ.c b/lapack-netlib/SRC/cgbequ.c index a7abb8f4e1..216ad1cae5 100644 --- a/lapack-netlib/SRC/cgbequ.c +++ b/lapack-netlib/SRC/cgbequ.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ integer ab_dim1, ab_offset, afb_dim1, afb_offset, b_dim1, b_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; real r__1, r__2, r__3, r__4; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer kase; diff --git a/lapack-netlib/SRC/cgbsv.c b/lapack-netlib/SRC/cgbsv.c index a5c1d7d125..b83224ba30 100644 --- a/lapack-netlib/SRC/cgbsv.c +++ b/lapack-netlib/SRC/cgbsv.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ integer ab_dim1, ab_offset, afb_dim1, afb_offset, b_dim1, b_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5; real r__1, r__2; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ real amax; diff --git a/lapack-netlib/SRC/cgbsvxx.c b/lapack-netlib/SRC/cgbsvxx.c index 774bc30d78..d75ffa9e1c 100644 --- a/lapack-netlib/SRC/cgbsvxx.c +++ b/lapack-netlib/SRC/cgbsvxx.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer ab_dim1, ab_offset, i__1, i__2, i__3, i__4; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer i__, j; diff --git a/lapack-netlib/SRC/cgbtrf.c b/lapack-netlib/SRC/cgbtrf.c index cacdd6ec24..438d5f9126 100644 --- a/lapack-netlib/SRC/cgbtrf.c +++ b/lapack-netlib/SRC/cgbtrf.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer ab_dim1, ab_offset, i__1, i__2, i__3, i__4, i__5, i__6; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ complex temp; diff --git a/lapack-netlib/SRC/cgbtrs.c b/lapack-netlib/SRC/cgbtrs.c index cd2232c14d..37f309b0c7 100644 --- a/lapack-netlib/SRC/cgbtrs.c +++ b/lapack-netlib/SRC/cgbtrs.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer ab_dim1, ab_offset, b_dim1, b_offset, i__1, i__2, i__3; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer i__, j, l; diff --git a/lapack-netlib/SRC/cgebak.c b/lapack-netlib/SRC/cgebak.c index 92cb843d06..eedb881be0 100644 --- a/lapack-netlib/SRC/cgebak.c +++ b/lapack-netlib/SRC/cgebak.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer i__; diff --git a/lapack-netlib/SRC/cgebrd.c b/lapack-netlib/SRC/cgebrd.c index 1427fa34e1..30fc1dcf27 100644 --- a/lapack-netlib/SRC/cgebrd.c +++ b/lapack-netlib/SRC/cgebrd.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; real r__1; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer i__, j; diff --git a/lapack-netlib/SRC/cgecon.c b/lapack-netlib/SRC/cgecon.c index a8f084ec84..f9276e5095 100644 --- a/lapack-netlib/SRC/cgecon.c +++ b/lapack-netlib/SRC/cgecon.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ integer a_dim1, a_offset, vl_dim1, vl_offset, vr_dim1, vr_offset, i__1, i__2, i__3; real r__1, r__2; - complex q__1, q__2; + complex q__1={0.,0.}, q__2={0.,0.}; /* Local variables */ integer ibal; diff --git a/lapack-netlib/SRC/cgeev.f b/lapack-netlib/SRC/cgeev.f index af14aa73ac..bb41599d1d 100644 --- a/lapack-netlib/SRC/cgeev.f +++ b/lapack-netlib/SRC/cgeev.f @@ -485,12 +485,12 @@ SUBROUTINE CGEEV( JOBVL, JOBVR, N, A, LDA, W, VL, LDVL, VR, LDVR, * Undo scaling if necessary * 50 CONTINUE - IF( SCALEA .AND. INFO.GT.0 ) THEN + IF( SCALEA ) THEN CALL CLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, W( INFO+1 ), $ MAX( N-INFO, 1 ), IERR ) - + IF( INFO.GT.0 ) THEN CALL CLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, W, N, IERR ) - + END IF END IF * WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) diff --git a/lapack-netlib/SRC/cgeevx.c b/lapack-netlib/SRC/cgeevx.c index 531ec71bef..3b673cdce0 100644 --- a/lapack-netlib/SRC/cgeevx.c +++ b/lapack-netlib/SRC/cgeevx.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ integer a_dim1, a_offset, vl_dim1, vl_offset, vr_dim1, vr_offset, i__1, i__2, i__3; real r__1, r__2; - complex q__1, q__2; + complex q__1={0.,0.}, q__2={0.,0.}; /* Local variables */ char side[1]; diff --git a/lapack-netlib/SRC/cgehd2.c b/lapack-netlib/SRC/cgehd2.c index 2330a27ae5..99971453fd 100644 --- a/lapack-netlib/SRC/cgehd2.c +++ b/lapack-netlib/SRC/cgehd2.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer i__; diff --git a/lapack-netlib/SRC/cgehrd.c b/lapack-netlib/SRC/cgehrd.c index a3919624bd..0322f1bf6b 100644 --- a/lapack-netlib/SRC/cgehrd.c +++ b/lapack-netlib/SRC/cgehrd.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer i__, j; diff --git a/lapack-netlib/SRC/cgejsv.c b/lapack-netlib/SRC/cgejsv.c index a3e356ff26..fde94854c7 100644 --- a/lapack-netlib/SRC/cgejsv.c +++ b/lapack-netlib/SRC/cgejsv.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ integer a_dim1, a_offset, u_dim1, u_offset, v_dim1, v_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8, i__9, i__10, i__11; real r__1, r__2, r__3; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer lwrk_cunmqr__; diff --git a/lapack-netlib/SRC/cgelq.c b/lapack-netlib/SRC/cgelq.c index 889d8985a7..f37277562e 100644 --- a/lapack-netlib/SRC/cgelq.c +++ b/lapack-netlib/SRC/cgelq.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3; - real r__1; + real r__1=0.; /* Local variables */ real anrm, bnrm; diff --git a/lapack-netlib/SRC/cgelst.c b/lapack-netlib/SRC/cgelst.c index 86c5341a94..08e24bc50a 100644 --- a/lapack-netlib/SRC/cgelst.c +++ b/lapack-netlib/SRC/cgelst.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3, i__4; - real r__1, r__2; - complex q__1; + real r__1=0., r__2=0.; + complex q__1={0.,0.}; /* Local variables */ real anrm, bnrm, smin, smax; diff --git a/lapack-netlib/SRC/cgemlq.c b/lapack-netlib/SRC/cgemlq.c index cbc80360b9..73c90b07a6 100644 --- a/lapack-netlib/SRC/cgemlq.c +++ b/lapack-netlib/SRC/cgemlq.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer i__, k; diff --git a/lapack-netlib/SRC/cgeqlf.c b/lapack-netlib/SRC/cgeqlf.c index e923ace8af..a2721241b6 100644 --- a/lapack-netlib/SRC/cgeqlf.c +++ b/lapack-netlib/SRC/cgeqlf.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer nfxd, j, nbmin; diff --git a/lapack-netlib/SRC/cgeqp3rk.c b/lapack-netlib/SRC/cgeqp3rk.c index 3afd9fc89f..ff07e9aedf 100644 --- a/lapack-netlib/SRC/cgeqp3rk.c +++ b/lapack-netlib/SRC/cgeqp3rk.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer i__, k; diff --git a/lapack-netlib/SRC/cgeqr2p.c b/lapack-netlib/SRC/cgeqr2p.c index f93abd79eb..70ec2fddf0 100644 --- a/lapack-netlib/SRC/cgeqr2p.c +++ b/lapack-netlib/SRC/cgeqr2p.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, t_dim1, t_offset, i__1, i__2, i__3, i__4, i__5; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer i__, j; diff --git a/lapack-netlib/SRC/cgerfs.c b/lapack-netlib/SRC/cgerfs.c index 3931e4716d..a65dc5aaa7 100644 --- a/lapack-netlib/SRC/cgerfs.c +++ b/lapack-netlib/SRC/cgerfs.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer a_dim1, a_offset, af_dim1, af_offset, b_dim1, b_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2, r__3, r__4; - complex q__1; + real r__1=0., r__2=0., r__3=0., r__4=0.; + complex q__1={0.,0.}; /* Local variables */ integer kase; diff --git a/lapack-netlib/SRC/cgerq2.c b/lapack-netlib/SRC/cgerq2.c index 205f6d3b95..2d7cc977df 100644 --- a/lapack-netlib/SRC/cgerq2.c +++ b/lapack-netlib/SRC/cgerq2.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6; - real r__1; - complex q__1, q__2, q__3; + real r__1=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}; /* Local variables */ complex temp; diff --git a/lapack-netlib/SRC/cgesdd.c b/lapack-netlib/SRC/cgesdd.c index 15f270501a..32a6dc8e6e 100644 --- a/lapack-netlib/SRC/cgesdd.c +++ b/lapack-netlib/SRC/cgesdd.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer a_dim1, a_offset, v_dim1, v_offset, i__1, i__2, i__3, i__4, i__5, i__6; - real r__1, r__2; - complex q__1, q__2, q__3; + real r__1=0., r__2=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}; /* Local variables */ real aapp; diff --git a/lapack-netlib/SRC/cgesvx.c b/lapack-netlib/SRC/cgesvx.c index 952ac564cb..e94b789b43 100644 --- a/lapack-netlib/SRC/cgesvx.c +++ b/lapack-netlib/SRC/cgesvx.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer a_dim1, a_offset, af_dim1, af_offset, b_dim1, b_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2; - complex q__1; + real r__1=0., r__2=0.; + complex q__1={0.,0.}; /* Local variables */ real amax; diff --git a/lapack-netlib/SRC/cgesvxx.c b/lapack-netlib/SRC/cgesvxx.c index a0f997545f..845205b190 100644 --- a/lapack-netlib/SRC/cgesvxx.c +++ b/lapack-netlib/SRC/cgesvxx.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3; - real r__1; - complex q__1; + real r__1=0.; + complex q__1={0.,0.}; /* Local variables */ real smin, xmax; diff --git a/lapack-netlib/SRC/cgetf2.c b/lapack-netlib/SRC/cgetf2.c index 8418162dbf..f89d5d819d 100644 --- a/lapack-netlib/SRC/cgetf2.c +++ b/lapack-netlib/SRC/cgetf2.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer i__, j; diff --git a/lapack-netlib/SRC/cgetrf.c b/lapack-netlib/SRC/cgetrf.c index 6202fc1496..a732d9597e 100644 --- a/lapack-netlib/SRC/cgetrf.c +++ b/lapack-netlib/SRC/cgetrf.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer i__, j; diff --git a/lapack-netlib/SRC/cgetrf2.c b/lapack-netlib/SRC/cgetrf2.c index 4592fbafba..562f0581ba 100644 --- a/lapack-netlib/SRC/cgetrf2.c +++ b/lapack-netlib/SRC/cgetrf2.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer i__, j; diff --git a/lapack-netlib/SRC/cgetrs.c b/lapack-netlib/SRC/cgetrs.c index ad5dfe7ce1..fb6cc83510 100644 --- a/lapack-netlib/SRC/cgetrs.c +++ b/lapack-netlib/SRC/cgetrs.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, t_dim1, t_offset, i__1, i__2, i__3, i__4; - real r__1, r__2, r__3; - complex q__1, q__2; + real r__1=0., r__2=0., r__3=0.; + complex q__1={0.,0.}, q__2={0.,0.}; /* Local variables */ integer ldwt, lworkopt, i__, j; diff --git a/lapack-netlib/SRC/cggbak.c b/lapack-netlib/SRC/cggbak.c index c3aa839459..ed4bc28793 100644 --- a/lapack-netlib/SRC/cggbak.c +++ b/lapack-netlib/SRC/cggbak.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, vsl_dim1, vsl_offset, vsr_dim1, vsr_offset, i__1, i__2; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ real anrm, bnrm; diff --git a/lapack-netlib/SRC/cggesx.c b/lapack-netlib/SRC/cggesx.c index ec6f4152cb..3d4c1cb2ab 100644 --- a/lapack-netlib/SRC/cggesx.c +++ b/lapack-netlib/SRC/cggesx.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, vl_dim1, vl_offset, vr_dim1, vr_offset, i__1, i__2, i__3, i__4; - real r__1, r__2, r__3, r__4; - complex q__1; + real r__1=0., r__2=0., r__3=0., r__4=0.; + complex q__1={0.,0.}; /* Local variables */ real anrm, bnrm; diff --git a/lapack-netlib/SRC/cggev3.c b/lapack-netlib/SRC/cggev3.c index a5768ca118..9ffae3f008 100644 --- a/lapack-netlib/SRC/cggev3.c +++ b/lapack-netlib/SRC/cggev3.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, vl_dim1, vl_offset, vr_dim1, vr_offset, i__1, i__2, i__3, i__4; - real r__1, r__2, r__3, r__4; - complex q__1; + real r__1=0., r__2=0., r__3=0., r__4=0.; + complex q__1={0.,0.}; /* Local variables */ real anrm, bnrm; diff --git a/lapack-netlib/SRC/cggevx.c b/lapack-netlib/SRC/cggevx.c index 8c17497ceb..d89a764f93 100644 --- a/lapack-netlib/SRC/cggevx.c +++ b/lapack-netlib/SRC/cggevx.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, vl_dim1, vl_offset, vr_dim1, vr_offset, i__1, i__2, i__3, i__4; - real r__1, r__2, r__3, r__4; - complex q__1; + real r__1=0., r__2=0., r__3=0., r__4=0.; + complex q__1={0.,0.}; /* Local variables */ real anrm, bnrm; diff --git a/lapack-netlib/SRC/cggglm.c b/lapack-netlib/SRC/cggglm.c index b8b69614bf..8757c72d00 100644 --- a/lapack-netlib/SRC/cggglm.c +++ b/lapack-netlib/SRC/cggglm.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3, i__4; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer lopt, i__; diff --git a/lapack-netlib/SRC/cgghd3.c b/lapack-netlib/SRC/cgghd3.c index 4394805abd..c88e5b7044 100644 --- a/lapack-netlib/SRC/cgghd3.c +++ b/lapack-netlib/SRC/cgghd3.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, q_dim1, q_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8, i__9; - complex q__1, q__2, q__3, q__4; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}; /* Local variables */ logical blk22; diff --git a/lapack-netlib/SRC/cgghrd.c b/lapack-netlib/SRC/cgghrd.c index 649c69b42f..f135036a5e 100644 --- a/lapack-netlib/SRC/cgghrd.c +++ b/lapack-netlib/SRC/cgghrd.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, q_dim1, q_offset, z_dim1, z_offset, i__1, i__2, i__3; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer jcol; diff --git a/lapack-netlib/SRC/cgglse.c b/lapack-netlib/SRC/cgglse.c index 1a991c0c72..7f03434287 100644 --- a/lapack-netlib/SRC/cgglse.c +++ b/lapack-netlib/SRC/cgglse.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3, i__4; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer lopt; diff --git a/lapack-netlib/SRC/cggqrf.c b/lapack-netlib/SRC/cggqrf.c index e59ef8f8b4..fc48b1c41b 100644 --- a/lapack-netlib/SRC/cggqrf.c +++ b/lapack-netlib/SRC/cggqrf.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer a_dim1, a_offset, v_dim1, v_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; - real r__1, r__2; - complex q__1, q__2, q__3; + real r__1=0., r__2=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}; /* Local variables */ real aapp; diff --git a/lapack-netlib/SRC/cgsvj1.c b/lapack-netlib/SRC/cgsvj1.c index d52880f3ba..104d142969 100644 --- a/lapack-netlib/SRC/cgsvj1.c +++ b/lapack-netlib/SRC/cgsvj1.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer a_dim1, a_offset, v_dim1, v_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; - real r__1, r__2; - complex q__1, q__2, q__3; + real r__1=0., r__2=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}; /* Local variables */ integer nblc; diff --git a/lapack-netlib/SRC/cgtcon.c b/lapack-netlib/SRC/cgtcon.c index bf0c32e9ff..42eaed144c 100644 --- a/lapack-netlib/SRC/cgtcon.c +++ b/lapack-netlib/SRC/cgtcon.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer b_dim1, b_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8, i__9; - real r__1, r__2, r__3, r__4, r__5, r__6, r__7, r__8, r__9, r__10, r__11, - r__12, r__13, r__14; - complex q__1; + real r__1=0., r__2=0., r__3=0., r__4=0., r__5=0., r__6=0., r__7=0., r__8=0., r__9=0., r__10=0., r__11=0., + r__12=0., r__13=0., r__14=0.; + complex q__1={0.,0.}; /* Local variables */ integer kase; diff --git a/lapack-netlib/SRC/cgtsv.c b/lapack-netlib/SRC/cgtsv.c index 7982e59bc8..5022d9f269 100644 --- a/lapack-netlib/SRC/cgtsv.c +++ b/lapack-netlib/SRC/cgtsv.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer i__1, i__2, i__3, i__4; - real r__1, r__2, r__3, r__4; - complex q__1, q__2; + real r__1=0., r__2=0., r__3=0., r__4=0.; + complex q__1={0.,0.}, q__2={0.,0.}; /* Local variables */ complex fact, temp; diff --git a/lapack-netlib/SRC/cgttrs.c b/lapack-netlib/SRC/cgttrs.c index e41b649a5b..5fefaf057c 100644 --- a/lapack-netlib/SRC/cgttrs.c +++ b/lapack-netlib/SRC/cgttrs.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer b_dim1, b_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8; - complex q__1, q__2, q__3, q__4, q__5, q__6, q__7, q__8; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}, q__5={0.,0.}, q__6={0.,0.}, q__7={0.,0.}, q__8={0.,0.}; /* Local variables */ - complex temp; + complex temp={0.,0.}; integer i__, j; diff --git a/lapack-netlib/SRC/chb2st_kernels.c b/lapack-netlib/SRC/chb2st_kernels.c index efb4158d01..6448d4d070 100644 --- a/lapack-netlib/SRC/chb2st_kernels.c +++ b/lapack-netlib/SRC/chb2st_kernels.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ complex ctmp; diff --git a/lapack-netlib/SRC/chbev.c b/lapack-netlib/SRC/chbev.c index 5b26dd38e5..30c101ba25 100644 --- a/lapack-netlib/SRC/chbev.c +++ b/lapack-netlib/SRC/chbev.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer ab_dim1, ab_offset, bb_dim1, bb_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8; - real r__1; - complex q__1, q__2, q__3, q__4, q__5, q__6, q__7, q__8, q__9, q__10; + real r__1=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}, q__5={0.,0.}, q__6={0.,0.}, q__7={0.,0.}, q__8={0.,0.}, q__9={0.,0.}, q__10={0.,0.}; /* Local variables */ integer inca; diff --git a/lapack-netlib/SRC/chbgv.c b/lapack-netlib/SRC/chbgv.c index fd25c6f785..dcd2eed499 100644 --- a/lapack-netlib/SRC/chbgv.c +++ b/lapack-netlib/SRC/chbgv.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer ab_dim1, ab_offset, q_dim1, q_offset, i__1, i__2, i__3, i__4, i__5, i__6; - real r__1; - complex q__1; + real r__1=0.; + complex q__1={0.,0.}; /* Local variables */ integer inca, jend, lend, jinc; diff --git a/lapack-netlib/SRC/checon.c b/lapack-netlib/SRC/checon.c index 56b65a7ec2..580d19f450 100644 --- a/lapack-netlib/SRC/checon.c +++ b/lapack-netlib/SRC/checon.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2; - real r__1, r__2; - complex q__1; + real r__1=0., r__2=0.; + complex q__1={0.,0.}; /* Local variables */ extern /* Subroutine */ void cher2_(char *, integer *, complex *, complex * diff --git a/lapack-netlib/SRC/chegst.c b/lapack-netlib/SRC/chegst.c index 9634a8036c..fe44fbe88c 100644 --- a/lapack-netlib/SRC/chegst.c +++ b/lapack-netlib/SRC/chegst.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer k; diff --git a/lapack-netlib/SRC/chegv.c b/lapack-netlib/SRC/chegv.c index 00fad8162c..ec48b865f7 100644 --- a/lapack-netlib/SRC/chegv.c +++ b/lapack-netlib/SRC/chegv.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer a_dim1, a_offset, af_dim1, af_offset, b_dim1, b_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2, r__3, r__4; - complex q__1; + real r__1=0., r__2=0., r__3=0., r__4=0.; + complex q__1={0.,0.}; /* Local variables */ integer kase; diff --git a/lapack-netlib/SRC/chesv.c b/lapack-netlib/SRC/chesv.c index 1754443d20..3952b4a838 100644 --- a/lapack-netlib/SRC/chesv.c +++ b/lapack-netlib/SRC/chesv.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer i__; diff --git a/lapack-netlib/SRC/chetd2.c b/lapack-netlib/SRC/chetd2.c index f459875db5..d2042499aa 100644 --- a/lapack-netlib/SRC/chetd2.c +++ b/lapack-netlib/SRC/chetd2.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3; - real r__1; - complex q__1, q__2, q__3, q__4; + real r__1=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}; /* Local variables */ complex taui; diff --git a/lapack-netlib/SRC/chetf2.c b/lapack-netlib/SRC/chetf2.c index d664807603..fae6f83fa9 100644 --- a/lapack-netlib/SRC/chetf2.c +++ b/lapack-netlib/SRC/chetf2.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6; - real r__1, r__2, r__3, r__4; - complex q__1, q__2, q__3, q__4, q__5, q__6; + real r__1=0., r__2=0., r__3=0., r__4=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}, q__5={0.,0.}, q__6={0.,0.}; /* Local variables */ extern /* Subroutine */ void cher_(char *, integer *, real *, complex *, diff --git a/lapack-netlib/SRC/chetf2_rk.c b/lapack-netlib/SRC/chetf2_rk.c index 1ee4af9410..c21de730bf 100644 --- a/lapack-netlib/SRC/chetf2_rk.c +++ b/lapack-netlib/SRC/chetf2_rk.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6; - real r__1, r__2; - complex q__1, q__2, q__3, q__4, q__5, q__6, q__7, q__8; + real r__1=0., r__2=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}, q__5={0.,0.}, q__6={0.,0.}, q__7={0.,0.}, q__8={0.,0.}; /* Local variables */ extern /* Subroutine */ void cher_(char *, integer *, real *, complex *, diff --git a/lapack-netlib/SRC/chetf2_rook.c b/lapack-netlib/SRC/chetf2_rook.c index 922c29fbbf..f4b15eb622 100644 --- a/lapack-netlib/SRC/chetf2_rook.c +++ b/lapack-netlib/SRC/chetf2_rook.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6; - real r__1, r__2; - complex q__1, q__2, q__3, q__4, q__5, q__6, q__7, q__8; + real r__1=0., r__2=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}, q__5={0.,0.}, q__6={0.,0.}, q__7={0.,0.}, q__8={0.,0.}; /* Local variables */ extern /* Subroutine */ void cher_(char *, integer *, real *, complex *, diff --git a/lapack-netlib/SRC/chetrd.c b/lapack-netlib/SRC/chetrd.c index 075071e345..1d219e976f 100644 --- a/lapack-netlib/SRC/chetrd.c +++ b/lapack-netlib/SRC/chetrd.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer i__, j; diff --git a/lapack-netlib/SRC/chetrd_2stage.c b/lapack-netlib/SRC/chetrd_2stage.c index 861051794a..8549efc710 100644 --- a/lapack-netlib/SRC/chetrd_2stage.c +++ b/lapack-netlib/SRC/chetrd_2stage.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer ab_dim1, ab_offset, i__1, i__2, i__3, i__4, i__5; - real r__1; - complex q__1; + real r__1=0.; + complex q__1={0.,0.}; /* Local variables */ integer inda; diff --git a/lapack-netlib/SRC/chetrd_he2hb.c b/lapack-netlib/SRC/chetrd_he2hb.c index 1bf08ed686..028b6cd3fa 100644 --- a/lapack-netlib/SRC/chetrd_he2hb.c +++ b/lapack-netlib/SRC/chetrd_he2hb.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer a_dim1, a_offset, ab_dim1, ab_offset, i__1, i__2, i__3, i__4, i__5; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ extern integer ilaenv2stage_(integer *, char *, char *, integer *, diff --git a/lapack-netlib/SRC/chetrf.c b/lapack-netlib/SRC/chetrf.c index 112140771c..c5a43be28d 100644 --- a/lapack-netlib/SRC/chetrf.c +++ b/lapack-netlib/SRC/chetrf.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4; - real r__1; - complex q__1; + real r__1=0.; + complex q__1={0.,0.}; /* Local variables */ extern /* Subroutine */ void clahef_aa_(char *, integer *, integer *, diff --git a/lapack-netlib/SRC/chetrf_aa_2stage.c b/lapack-netlib/SRC/chetrf_aa_2stage.c index bb34f96f01..8db6aab905 100644 --- a/lapack-netlib/SRC/chetrf_aa_2stage.c +++ b/lapack-netlib/SRC/chetrf_aa_2stage.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4; - real r__1; - complex q__1; + real r__1=0.; + complex q__1={0.,0.}; /* Local variables */ integer ldtb, i__, j, k; diff --git a/lapack-netlib/SRC/chetrf_rk.c b/lapack-netlib/SRC/chetrf_rk.c index 0bd16a089e..b091f09354 100644 --- a/lapack-netlib/SRC/chetrf_rk.c +++ b/lapack-netlib/SRC/chetrf_rk.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3; - real r__1; - complex q__1, q__2; + real r__1=0.; + complex q__1={0.,0.}, q__2={0.,0.}; /* Local variables */ complex temp, akkp1; diff --git a/lapack-netlib/SRC/chetri2.c b/lapack-netlib/SRC/chetri2.c index 8f319e4e95..864b199a15 100644 --- a/lapack-netlib/SRC/chetri2.c +++ b/lapack-netlib/SRC/chetri2.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer a_dim1, a_offset, work_dim1, work_offset, i__1, i__2, i__3, i__4, i__5, i__6; - real r__1; - complex q__1, q__2, q__3; + real r__1=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}; /* Local variables */ integer invd; diff --git a/lapack-netlib/SRC/chetri_3.c b/lapack-netlib/SRC/chetri_3.c index b966d1dbdb..91222597e5 100644 --- a/lapack-netlib/SRC/chetri_3.c +++ b/lapack-netlib/SRC/chetri_3.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3; - real r__1; - complex q__1, q__2; + real r__1=0.; + complex q__1={0.,0.}, q__2={0.,0.}; /* Local variables */ complex temp, akkp1; diff --git a/lapack-netlib/SRC/chetrs.c b/lapack-netlib/SRC/chetrs.c index 32d58c7b16..eeb133e7dd 100644 --- a/lapack-netlib/SRC/chetrs.c +++ b/lapack-netlib/SRC/chetrs.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2; - complex q__1, q__2, q__3; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}; /* Local variables */ complex akm1k; diff --git a/lapack-netlib/SRC/chetrs2.c b/lapack-netlib/SRC/chetrs2.c index 31e7c817d9..3d06e6bee2 100644 --- a/lapack-netlib/SRC/chetrs2.c +++ b/lapack-netlib/SRC/chetrs2.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2; - complex q__1, q__2, q__3; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}; /* Local variables */ complex akm1k; diff --git a/lapack-netlib/SRC/chfrk.c b/lapack-netlib/SRC/chfrk.c index 23d23355c3..9089aac8e8 100644 --- a/lapack-netlib/SRC/chfrk.c +++ b/lapack-netlib/SRC/chfrk.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer h_dim1, h_offset, q_dim1, q_offset, t_dim1, t_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5, i__6; - real r__1, r__2, r__3, r__4, r__5, r__6; - complex q__1, q__2, q__3, q__4, q__5, q__6, q__7; + real r__1=0., r__2=0., r__3=0., r__4=0., r__5=0., r__6=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}, q__5={0.,0.}, q__6={0.,0.}, q__7={0.,0.}; /* Local variables */ real absb, atol, btol, temp; diff --git a/lapack-netlib/SRC/chla_transtype.c b/lapack-netlib/SRC/chla_transtype.c index 7b0781d0fb..c4e3252c53 100644 --- a/lapack-netlib/SRC/chla_transtype.c +++ b/lapack-netlib/SRC/chla_transtype.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer i__1, i__2, i__3, i__4; - real r__1, r__2; - complex q__1, q__2, q__3; + real r__1=0., r__2=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}; /* Local variables */ extern /* Subroutine */ void chpr2_(char *, integer *, complex *, complex * diff --git a/lapack-netlib/SRC/chpgv.c b/lapack-netlib/SRC/chpgv.c index 1c079e7538..0a3edc10c4 100644 --- a/lapack-netlib/SRC/chpgv.c +++ b/lapack-netlib/SRC/chpgv.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer b_dim1, b_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2, r__3, r__4; - complex q__1; + real r__1=0., r__2=0., r__3=0., r__4=0.; + complex q__1={0.,0.}; /* Local variables */ integer kase; diff --git a/lapack-netlib/SRC/chpsv.c b/lapack-netlib/SRC/chpsv.c index b10f9503e6..ab3738cb77 100644 --- a/lapack-netlib/SRC/chpsv.c +++ b/lapack-netlib/SRC/chpsv.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer i__1, i__2, i__3; - real r__1; - complex q__1, q__2, q__3, q__4; + real r__1=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}; /* Local variables */ complex taui; diff --git a/lapack-netlib/SRC/chptrf.c b/lapack-netlib/SRC/chptrf.c index 9bbcd22f05..6193c9ebcb 100644 --- a/lapack-netlib/SRC/chptrf.c +++ b/lapack-netlib/SRC/chptrf.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer i__1, i__2, i__3, i__4, i__5, i__6; - real r__1, r__2, r__3, r__4; - complex q__1, q__2, q__3, q__4, q__5, q__6; + real r__1=0., r__2=0., r__3=0., r__4=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}, q__5={0.,0.}, q__6={0.,0.}; /* Local variables */ extern /* Subroutine */ void chpr_(char *, integer *, real *, complex *, diff --git a/lapack-netlib/SRC/chptri.c b/lapack-netlib/SRC/chptri.c index 054664681d..c2320d7614 100644 --- a/lapack-netlib/SRC/chptri.c +++ b/lapack-netlib/SRC/chptri.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer i__1, i__2, i__3; - real r__1; - complex q__1, q__2; + real r__1=0.; + complex q__1={0.,0.}, q__2={0.,0.}; /* Local variables */ complex temp, akkp1; diff --git a/lapack-netlib/SRC/chptrs.c b/lapack-netlib/SRC/chptrs.c index 919330f673..a80c6ea381 100644 --- a/lapack-netlib/SRC/chptrs.c +++ b/lapack-netlib/SRC/chptrs.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer b_dim1, b_offset, i__1, i__2; - complex q__1, q__2, q__3; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}; /* Local variables */ complex akm1k; diff --git a/lapack-netlib/SRC/chsein.c b/lapack-netlib/SRC/chsein.c index 87c177bc64..b7fd6101eb 100644 --- a/lapack-netlib/SRC/chsein.c +++ b/lapack-netlib/SRC/chsein.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer h_dim1, h_offset, vl_dim1, vl_offset, vr_dim1, vr_offset, i__1, i__2, i__3; - real r__1, r__2; - complex q__1, q__2; + real r__1=0., r__2=0.; + complex q__1={0.,0.}, q__2={0.,0.}; /* Local variables */ real unfl; diff --git a/lapack-netlib/SRC/chseqr.c b/lapack-netlib/SRC/chseqr.c index b98db683b1..a29094e831 100644 --- a/lapack-netlib/SRC/chseqr.c +++ b/lapack-netlib/SRC/chseqr.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ address a__1[2]; integer h_dim1, h_offset, z_dim1, z_offset, i__1, i__2, i__3[2]; - real r__1, r__2, r__3; - complex q__1; + real r__1=0., r__2=0., r__3=0.; + complex q__1={0.,0.}; char ch__1[2]; /* Local variables */ diff --git a/lapack-netlib/SRC/cla_gbamv.c b/lapack-netlib/SRC/cla_gbamv.c index 1d3b549214..d4032e3135 100644 --- a/lapack-netlib/SRC/cla_gbamv.c +++ b/lapack-netlib/SRC/cla_gbamv.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer ayb_dim1, ayb_offset, res_dim1, res_offset, i__1, i__2, i__3, i__4; - real r__1, r__2, r__3; - complex q__1, q__2, q__3; + real r__1=0., r__2=0., r__3=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}; /* Local variables */ real safe1; diff --git a/lapack-netlib/SRC/cla_porcond_c.c b/lapack-netlib/SRC/cla_porcond_c.c index e282fb42a3..fd3fd1cdb9 100644 --- a/lapack-netlib/SRC/cla_porcond_c.c +++ b/lapack-netlib/SRC/cla_porcond_c.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer i__1, i__2, i__3, i__4, i__5; - complex q__1, q__2, q__3; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}; /* Local variables */ integer i__; diff --git a/lapack-netlib/SRC/clabrd.c b/lapack-netlib/SRC/clabrd.c index 0d1d299a8f..725d684ee1 100644 --- a/lapack-netlib/SRC/clabrd.c +++ b/lapack-netlib/SRC/clabrd.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer a_dim1, a_offset, x_dim1, x_offset, y_dim1, y_offset, i__1, i__2, i__3; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer i__; diff --git a/lapack-netlib/SRC/clacgv.c b/lapack-netlib/SRC/clacgv.c index 46e8e115ed..0f099aed51 100644 --- a/lapack-netlib/SRC/clacgv.c +++ b/lapack-netlib/SRC/clacgv.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer i__1, i__2; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer ioff, i__; diff --git a/lapack-netlib/SRC/clacn2.c b/lapack-netlib/SRC/clacn2.c index f4138bdc7d..2518e3ffaf 100644 --- a/lapack-netlib/SRC/clacn2.c +++ b/lapack-netlib/SRC/clacn2.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer i__1, i__2, i__3; - real r__1, r__2; - complex q__1; + real r__1=0., r__2=0.; + complex q__1={0.,0.}; /* Local variables */ real temp; diff --git a/lapack-netlib/SRC/clacon.c b/lapack-netlib/SRC/clacon.c index 52187649f9..8fe2f5bfc9 100644 --- a/lapack-netlib/SRC/clacon.c +++ b/lapack-netlib/SRC/clacon.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer i__1, i__2, i__3; - real r__1, r__2; - complex q__1; + real r__1=0., r__2=0.; + complex q__1={0.,0.}; /* Local variables */ static integer iter; diff --git a/lapack-netlib/SRC/clacp2.c b/lapack-netlib/SRC/clacp2.c index 078018ef79..f73072f5fc 100644 --- a/lapack-netlib/SRC/clacp2.c +++ b/lapack-netlib/SRC/clacp2.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer b_dim1, b_offset, a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5; - real r__1; - complex q__1; + real r__1=0.; + complex q__1={0.,0.}; /* Local variables */ integer i__, j, l; diff --git a/lapack-netlib/SRC/clacrt.c b/lapack-netlib/SRC/clacrt.c index 7faa247797..e800fd1a34 100644 --- a/lapack-netlib/SRC/clacrt.c +++ b/lapack-netlib/SRC/clacrt.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer i__1, i__2, i__3, i__4; - complex q__1, q__2, q__3; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}; /* Local variables */ integer i__; diff --git a/lapack-netlib/SRC/cladiv.c b/lapack-netlib/SRC/cladiv.c index 10ffa2a284..5c948efd08 100644 --- a/lapack-netlib/SRC/cladiv.c +++ b/lapack-netlib/SRC/cladiv.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* Complex */ VOID cladiv_(complex * ret_val, complex *x, complex *y) { /* System generated locals */ - real r__1, r__2, r__3, r__4; - complex q__1; + real r__1=0., r__2=0., r__3=0., r__4=0.; + complex q__1={0.,0.}; /* Local variables */ real zi, zr; diff --git a/lapack-netlib/SRC/claed0.c b/lapack-netlib/SRC/claed0.c index 381ca74c1f..ac907f9b90 100644 --- a/lapack-netlib/SRC/claed0.c +++ b/lapack-netlib/SRC/claed0.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer b_dim1, b_offset, h_dim1, h_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2, r__3, r__4; - complex q__1, q__2; + real r__1=0., r__2=0., r__3=0., r__4=0.; + complex q__1={0.,0.}, q__2={0.,0.}; /* Local variables */ integer ierr; diff --git a/lapack-netlib/SRC/claesy.c b/lapack-netlib/SRC/claesy.c index 81440412bd..ebf33ff5a3 100644 --- a/lapack-netlib/SRC/claesy.c +++ b/lapack-netlib/SRC/claesy.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -376,19 +376,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ rt1, complex *rt2, complex *evscal, complex *cs1, complex *sn1) { /* System generated locals */ - real r__1, r__2; - complex q__1, q__2, q__3, q__4, q__5, q__6, q__7; + real r__1=0., r__2=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}, q__5={0.,0.}, q__6={0.,0.}, q__7={0.,0.}; /* Local variables */ real babs, tabs; diff --git a/lapack-netlib/SRC/claev2.c b/lapack-netlib/SRC/claev2.c index 69736b44c1..cc47d555a9 100644 --- a/lapack-netlib/SRC/claev2.c +++ b/lapack-netlib/SRC/claev2.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ real *rt2, real *cs1, complex *sn1) { /* System generated locals */ - real r__1, r__2, r__3; - complex q__1, q__2; + real r__1=0., r__2=0., r__3=0.; + complex q__1={0.,0.}, q__2={0.,0.}; /* Local variables */ real t; diff --git a/lapack-netlib/SRC/clag2z.c b/lapack-netlib/SRC/clag2z.c index ff455c7e2e..46021a22fc 100644 --- a/lapack-netlib/SRC/clag2z.c +++ b/lapack-netlib/SRC/clag2z.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ complex *snv, real *csq, complex *snq) { /* System generated locals */ - real r__1, r__2, r__3, r__4, r__5, r__6, r__7, r__8; - complex q__1, q__2, q__3, q__4, q__5; + real r__1=0., r__2=0., r__3=0., r__4=0., r__5=0., r__6=0., r__7=0., r__8=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}, q__5={0.,0.}; /* Local variables */ real aua11, aua12, aua21, aua22, avb11, avb12, avb21, avb22, ua11r, ua22r, diff --git a/lapack-netlib/SRC/clagtm.c b/lapack-netlib/SRC/clagtm.c index 0bc287e046..9d1ff8f380 100644 --- a/lapack-netlib/SRC/clagtm.c +++ b/lapack-netlib/SRC/clagtm.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer b_dim1, b_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8, i__9, i__10; - complex q__1, q__2, q__3, q__4, q__5, q__6, q__7, q__8, q__9; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}, q__5={0.,0.}, q__6={0.,0.}, q__7={0.,0.}, q__8={0.,0.}, q__9={0.,0.}; /* Local variables */ integer i__, j; diff --git a/lapack-netlib/SRC/clahef.c b/lapack-netlib/SRC/clahef.c index 42bd1ac771..c5c0fe9919 100644 --- a/lapack-netlib/SRC/clahef.c +++ b/lapack-netlib/SRC/clahef.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2, r__3, r__4; - complex q__1, q__2, q__3, q__4; + real r__1=0., r__2=0., r__3=0., r__4=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}; /* Local variables */ integer imax, jmax, j, k; diff --git a/lapack-netlib/SRC/clahef_aa.c b/lapack-netlib/SRC/clahef_aa.c index 10e024a759..5a39d3939f 100644 --- a/lapack-netlib/SRC/clahef_aa.c +++ b/lapack-netlib/SRC/clahef_aa.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, h_dim1, h_offset, i__1, i__2; - real r__1; - complex q__1, q__2; + real r__1=0.; + complex q__1={0.,0.}, q__2={0.,0.}; /* Local variables */ integer j, k; diff --git a/lapack-netlib/SRC/clahef_rk.c b/lapack-netlib/SRC/clahef_rk.c index a841b2515e..088c40e218 100644 --- a/lapack-netlib/SRC/clahef_rk.c +++ b/lapack-netlib/SRC/clahef_rk.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2; - complex q__1, q__2, q__3, q__4, q__5; + real r__1=0., r__2=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}, q__5={0.,0.}; /* Local variables */ logical done; diff --git a/lapack-netlib/SRC/clahef_rook.c b/lapack-netlib/SRC/clahef_rook.c index f77aedc3c3..30bbeacf60 100644 --- a/lapack-netlib/SRC/clahef_rook.c +++ b/lapack-netlib/SRC/clahef_rook.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2; - complex q__1, q__2, q__3, q__4, q__5; + real r__1=0., r__2=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}, q__5={0.,0.}; /* Local variables */ logical done; diff --git a/lapack-netlib/SRC/clahqr.c b/lapack-netlib/SRC/clahqr.c index 05aaa484d9..c60709699f 100644 --- a/lapack-netlib/SRC/clahqr.c +++ b/lapack-netlib/SRC/clahqr.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -377,19 +377,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer h_dim1, h_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2, r__3, r__4, r__5, r__6; - complex q__1, q__2, q__3, q__4, q__5, q__6, q__7; + real r__1=0., r__2=0., r__3=0., r__4=0., r__5=0., r__6=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}, q__5={0.,0.}, q__6={0.,0.}, q__7={0.,0.}; /* Local variables */ complex temp; diff --git a/lapack-netlib/SRC/clahr2.c b/lapack-netlib/SRC/clahr2.c index f3877b07da..dbe85f13e6 100644 --- a/lapack-netlib/SRC/clahr2.c +++ b/lapack-netlib/SRC/clahr2.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer a_dim1, a_offset, t_dim1, t_offset, y_dim1, y_offset, i__1, i__2, i__3; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer i__; diff --git a/lapack-netlib/SRC/claic1.c b/lapack-netlib/SRC/claic1.c index 0c51d41d21..f807d77464 100644 --- a/lapack-netlib/SRC/claic1.c +++ b/lapack-netlib/SRC/claic1.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ complex *w, complex *gamma, real *sestpr, complex *s, complex *c__) { /* System generated locals */ - real r__1, r__2; - complex q__1, q__2, q__3, q__4, q__5, q__6; + real r__1=0., r__2=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}, q__5={0.,0.}, q__6={0.,0.}; /* Local variables */ complex sine; diff --git a/lapack-netlib/SRC/clals0.c b/lapack-netlib/SRC/clals0.c index c37066cacc..a405cf6dac 100644 --- a/lapack-netlib/SRC/clals0.c +++ b/lapack-netlib/SRC/clals0.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ integer givcol_dim1, givcol_offset, difr_dim1, difr_offset, givnum_dim1, givnum_offset, poles_dim1, poles_offset, b_dim1, b_offset, bx_dim1, bx_offset, i__1, i__2, i__3, i__4, i__5; - real r__1; - complex q__1; + real r__1=0.; + complex q__1={0.,0.}; /* Local variables */ integer jcol; diff --git a/lapack-netlib/SRC/clalsa.c b/lapack-netlib/SRC/clalsa.c index 2d8d187577..13233c8e00 100644 --- a/lapack-netlib/SRC/clalsa.c +++ b/lapack-netlib/SRC/clalsa.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ poles_dim1, poles_offset, u_dim1, u_offset, vt_dim1, vt_offset, z_dim1, z_offset, b_dim1, b_offset, bx_dim1, bx_offset, i__1, i__2, i__3, i__4, i__5, i__6; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer jcol, nlvl, sqre, jrow, i__, j, jimag, jreal, inode, ndiml; diff --git a/lapack-netlib/SRC/clalsd.c b/lapack-netlib/SRC/clalsd.c index 3c0ac35ebe..555322ab27 100644 --- a/lapack-netlib/SRC/clalsd.c +++ b/lapack-netlib/SRC/clalsd.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer b_dim1, b_offset, i__1, i__2, i__3, i__4, i__5, i__6; - real r__1; - complex q__1; + real r__1=0.; + complex q__1={0.,0.}; /* Local variables */ integer difl, difr; diff --git a/lapack-netlib/SRC/clamswlq.c b/lapack-netlib/SRC/clamswlq.c index cc666294bb..8703deaae3 100644 --- a/lapack-netlib/SRC/clamswlq.c +++ b/lapack-netlib/SRC/clamswlq.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer i__1; - real r__1, r__2, r__3; - complex q__1, q__2, q__3, q__4; + real r__1=0., r__2=0., r__3=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}; /* Local variables */ extern /* Subroutine */ void slas2_(real *, real *, real *, real *, real *) diff --git a/lapack-netlib/SRC/clapmr.c b/lapack-netlib/SRC/clapmr.c index d611167b9e..5720866029 100644 --- a/lapack-netlib/SRC/clapmr.c +++ b/lapack-netlib/SRC/clapmr.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer ab_dim1, ab_offset, i__1, i__2, i__3, i__4, i__5, i__6; - real r__1; - complex q__1; + real r__1=0.; + complex q__1={0.,0.}; /* Local variables */ integer i__, j; diff --git a/lapack-netlib/SRC/claqge.c b/lapack-netlib/SRC/claqge.c index dcedde2a7c..239d00113f 100644 --- a/lapack-netlib/SRC/claqge.c +++ b/lapack-netlib/SRC/claqge.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; - real r__1; - complex q__1; + real r__1=0.; + complex q__1={0.,0.}; /* Local variables */ integer i__, j; diff --git a/lapack-netlib/SRC/claqhb.c b/lapack-netlib/SRC/claqhb.c index 3f6741b991..d4a8d6a4f0 100644 --- a/lapack-netlib/SRC/claqhb.c +++ b/lapack-netlib/SRC/claqhb.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer ab_dim1, ab_offset, i__1, i__2, i__3, i__4; - real r__1; - complex q__1; + real r__1=0.; + complex q__1={0.,0.}; /* Local variables */ integer i__, j; diff --git a/lapack-netlib/SRC/claqhe.c b/lapack-netlib/SRC/claqhe.c index c41f85c984..466e48c764 100644 --- a/lapack-netlib/SRC/claqhe.c +++ b/lapack-netlib/SRC/claqhe.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4; - real r__1; - complex q__1; + real r__1=0.; + complex q__1={0.,0.}; /* Local variables */ integer i__, j; diff --git a/lapack-netlib/SRC/claqhp.c b/lapack-netlib/SRC/claqhp.c index 8d7b1987e5..d857a1c55c 100644 --- a/lapack-netlib/SRC/claqhp.c +++ b/lapack-netlib/SRC/claqhp.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer i__1, i__2, i__3, i__4; - real r__1; - complex q__1; + real r__1=0.; + complex q__1={0.,0.}; /* Local variables */ integer i__, j; diff --git a/lapack-netlib/SRC/claqp2.c b/lapack-netlib/SRC/claqp2.c index 6e7ff04c10..5e80d30520 100644 --- a/lapack-netlib/SRC/claqp2.c +++ b/lapack-netlib/SRC/claqp2.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3; - real r__1; - complex q__1; + real r__1=0.; + complex q__1={0.,0.}; /* Local variables */ real temp, temp2; diff --git a/lapack-netlib/SRC/claqp2rk.c b/lapack-netlib/SRC/claqp2rk.c index 9e64e24289..e45f409608 100644 --- a/lapack-netlib/SRC/claqp2rk.c +++ b/lapack-netlib/SRC/claqp2rk.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, f_dim1, f_offset, i__1, i__2, i__3; - real r__1, r__2; - complex q__1; + real r__1=0., r__2=0.; + complex q__1={0.,0.}; /* Local variables */ real temp, temp2; diff --git a/lapack-netlib/SRC/claqr0.c b/lapack-netlib/SRC/claqr0.c index 25e589b239..5ed19e327b 100644 --- a/lapack-netlib/SRC/claqr0.c +++ b/lapack-netlib/SRC/claqr0.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer h_dim1, h_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2, r__3, r__4, r__5, r__6, r__7, r__8; - complex q__1, q__2, q__3, q__4, q__5; + real r__1=0., r__2=0., r__3=0., r__4=0., r__5=0., r__6=0., r__7=0., r__8=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}, q__5={0.,0.}; /* Local variables */ integer ndec, ndfl, kbot, nmin; diff --git a/lapack-netlib/SRC/claqr1.c b/lapack-netlib/SRC/claqr1.c index b69a6f635d..f11975379b 100644 --- a/lapack-netlib/SRC/claqr1.c +++ b/lapack-netlib/SRC/claqr1.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer h_dim1, h_offset, i__1, i__2, i__3, i__4; - real r__1, r__2, r__3, r__4, r__5, r__6; - complex q__1, q__2, q__3, q__4, q__5, q__6, q__7, q__8; + real r__1=0., r__2=0., r__3=0., r__4=0., r__5=0., r__6=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}, q__5={0.,0.}, q__6={0.,0.}, q__7={0.,0.}, q__8={0.,0.}; /* Local variables */ real s; diff --git a/lapack-netlib/SRC/claqr2.c b/lapack-netlib/SRC/claqr2.c index ee1f8050b6..2c36f088f7 100644 --- a/lapack-netlib/SRC/claqr2.c +++ b/lapack-netlib/SRC/claqr2.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer h_dim1, h_offset, t_dim1, t_offset, v_dim1, v_offset, wv_dim1, wv_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4; - real r__1, r__2, r__3, r__4, r__5, r__6; - complex q__1, q__2; + real r__1=0., r__2=0., r__3=0., r__4=0., r__5=0., r__6=0.; + complex q__1={0.,0.}, q__2={0.,0.}; /* Local variables */ complex beta; diff --git a/lapack-netlib/SRC/claqr3.c b/lapack-netlib/SRC/claqr3.c index e0c84e6c86..e68055594e 100644 --- a/lapack-netlib/SRC/claqr3.c +++ b/lapack-netlib/SRC/claqr3.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer h_dim1, h_offset, t_dim1, t_offset, v_dim1, v_offset, wv_dim1, wv_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4; - real r__1, r__2, r__3, r__4, r__5, r__6; - complex q__1, q__2; + real r__1=0., r__2=0., r__3=0., r__4=0., r__5=0., r__6=0.; + complex q__1={0.,0.}, q__2={0.,0.}; /* Local variables */ complex beta; diff --git a/lapack-netlib/SRC/claqr4.c b/lapack-netlib/SRC/claqr4.c index 3ebe723ba9..b978fedf21 100644 --- a/lapack-netlib/SRC/claqr4.c +++ b/lapack-netlib/SRC/claqr4.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer h_dim1, h_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2, r__3, r__4, r__5, r__6, r__7, r__8; - complex q__1, q__2, q__3, q__4, q__5; + real r__1=0., r__2=0., r__3=0., r__4=0., r__5=0., r__6=0., r__7=0., r__8=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}, q__5={0.,0.}; /* Local variables */ integer ndec, ndfl, kbot, nmin; diff --git a/lapack-netlib/SRC/claqr5.c b/lapack-netlib/SRC/claqr5.c index e464801449..f1ff8a0931 100644 --- a/lapack-netlib/SRC/claqr5.c +++ b/lapack-netlib/SRC/claqr5.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ integer h_dim1, h_offset, u_dim1, u_offset, v_dim1, v_offset, wh_dim1, wh_offset, wv_dim1, wv_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8, i__9, i__10, i__11; - real r__1, r__2, r__3, r__4, r__5, r__6, r__7, r__8, r__9, r__10; - complex q__1, q__2, q__3, q__4, q__5, q__6, q__7, q__8; + real r__1=0., r__2=0., r__3=0., r__4=0., r__5=0., r__6=0., r__7=0., r__8=0., r__9=0., r__10=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}, q__5={0.,0.}, q__6={0.,0.}, q__7={0.,0.}, q__8={0.,0.}; /* Local variables */ complex beta; diff --git a/lapack-netlib/SRC/claqsb.c b/lapack-netlib/SRC/claqsb.c index a661a762ba..efe24415b0 100644 --- a/lapack-netlib/SRC/claqsb.c +++ b/lapack-netlib/SRC/claqsb.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer ab_dim1, ab_offset, i__1, i__2, i__3, i__4; - real r__1; - complex q__1; + real r__1=0.; + complex q__1={0.,0.}; /* Local variables */ integer i__, j; diff --git a/lapack-netlib/SRC/claqsp.c b/lapack-netlib/SRC/claqsp.c index 0eea85904f..25792feb4c 100644 --- a/lapack-netlib/SRC/claqsp.c +++ b/lapack-netlib/SRC/claqsp.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer i__1, i__2, i__3, i__4; - real r__1; - complex q__1; + real r__1=0.; + complex q__1={0.,0.}; /* Local variables */ integer i__, j; diff --git a/lapack-netlib/SRC/claqsy.c b/lapack-netlib/SRC/claqsy.c index d332c59a8d..98523d11ad 100644 --- a/lapack-netlib/SRC/claqsy.c +++ b/lapack-netlib/SRC/claqsy.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4; - real r__1; - complex q__1; + real r__1=0.; + complex q__1={0.,0.}; /* Local variables */ integer i__, j; diff --git a/lapack-netlib/SRC/clar1v.c b/lapack-netlib/SRC/clar1v.c index f96f97d411..07965e4af4 100644 --- a/lapack-netlib/SRC/clar1v.c +++ b/lapack-netlib/SRC/clar1v.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer i__1, i__2, i__3, i__4; - real r__1; - complex q__1, q__2; + real r__1=0.; + complex q__1={0.,0.}, q__2={0.,0.}; /* Local variables */ integer indp, inds, i__; diff --git a/lapack-netlib/SRC/clar2v.c b/lapack-netlib/SRC/clar2v.c index 8e0c1a0b05..9f3277129a 100644 --- a/lapack-netlib/SRC/clar2v.c +++ b/lapack-netlib/SRC/clar2v.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer i__1, i__2; - real r__1; - complex q__1, q__2, q__3, q__4, q__5; + real r__1=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}, q__5={0.,0.}; /* Local variables */ integer i__; diff --git a/lapack-netlib/SRC/clarcm.c b/lapack-netlib/SRC/clarcm.c index 6bc44f9d0b..97bae6d170 100644 --- a/lapack-netlib/SRC/clarcm.c +++ b/lapack-netlib/SRC/clarcm.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5; - real r__1; - complex q__1; + real r__1=0.; + complex q__1={0.,0.}; /* Local variables */ integer i__, j, l; diff --git a/lapack-netlib/SRC/clarf.c b/lapack-netlib/SRC/clarf.c index 770f94b188..08dbd774a8 100644 --- a/lapack-netlib/SRC/clarf.c +++ b/lapack-netlib/SRC/clarf.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer c_dim1, c_offset, t_dim1, t_offset, v_dim1, v_offset, work_dim1, work_offset, i__1, i__2, i__3, i__4, i__5; - complex q__1, q__2; + complex q__1={0.,0.}, q__2={0.,0.}; /* Local variables */ integer i__, j; diff --git a/lapack-netlib/SRC/clarfb_gett.c b/lapack-netlib/SRC/clarfb_gett.c index 5cca69d24c..273eaed76c 100644 --- a/lapack-netlib/SRC/clarfb_gett.c +++ b/lapack-netlib/SRC/clarfb_gett.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, t_dim1, t_offset, work_dim1, work_offset, i__1, i__2, i__3, i__4, i__5; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer i__, j; diff --git a/lapack-netlib/SRC/clarfg.c b/lapack-netlib/SRC/clarfg.c index 9deed53d85..62962222a6 100644 --- a/lapack-netlib/SRC/clarfg.c +++ b/lapack-netlib/SRC/clarfg.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer i__1; - real r__1, r__2; - complex q__1, q__2; + real r__1=0., r__2=0.; + complex q__1={0.,0.}, q__2={0.,0.}; /* Local variables */ real beta; diff --git a/lapack-netlib/SRC/clarfgp.c b/lapack-netlib/SRC/clarfgp.c index 5de4728f47..3751bc720d 100644 --- a/lapack-netlib/SRC/clarfgp.c +++ b/lapack-netlib/SRC/clarfgp.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer t_dim1, t_offset, v_dim1, v_offset, i__1, i__2, i__3, i__4, i__5; - complex q__1, q__2, q__3; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}; /* Local variables */ integer i__, j; diff --git a/lapack-netlib/SRC/clarfx.c b/lapack-netlib/SRC/clarfx.c index 4f70bb97ec..e3e507d29d 100644 --- a/lapack-netlib/SRC/clarfx.c +++ b/lapack-netlib/SRC/clarfx.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8, i__9, i__10, i__11; - complex q__1, q__2, q__3, q__4, q__5, q__6, q__7, q__8, q__9, q__10, - q__11, q__12, q__13, q__14, q__15, q__16, q__17, q__18, q__19; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}, q__5={0.,0.}, q__6={0.,0.}, q__7={0.,0.}, q__8={0.,0.}, q__9={0.,0.}, q__10={0.,0.}, + q__11={0.,0.}, q__12={0.,0.}, q__13={0.,0.}, q__14={0.,0.}, q__15={0.,0.}, q__16={0.,0.}, q__17={0.,0.}, q__18={0.,0.}, q__19={0.,0.}; /* Local variables */ integer j; diff --git a/lapack-netlib/SRC/clarfy.c b/lapack-netlib/SRC/clarfy.c index 086902c47f..02137c0f5f 100644 --- a/lapack-netlib/SRC/clarfy.c +++ b/lapack-netlib/SRC/clarfy.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer i__1, i__2; - real r__1, r__2, r__3, r__4, r__5, r__6, r__7, r__8, r__9, r__10; - complex q__1, q__2, q__3; + real r__1=0., r__2=0., r__3=0., r__4=0., r__5=0., r__6=0., r__7=0., r__8=0., r__9=0., r__10=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}; /* Local variables */ real d__; diff --git a/lapack-netlib/SRC/clarnv.c b/lapack-netlib/SRC/clarnv.c index e42cc0caff..870eeb4b28 100644 --- a/lapack-netlib/SRC/clarnv.c +++ b/lapack-netlib/SRC/clarnv.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer i__1, i__2, i__3, i__4, i__5; - real r__1, r__2; - complex q__1, q__2, q__3; + real r__1=0., r__2=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}; /* Local variables */ integer i__; diff --git a/lapack-netlib/SRC/clarrv.c b/lapack-netlib/SRC/clarrv.c index 92b3aa08c5..3eccb6f2b3 100644 --- a/lapack-netlib/SRC/clarrv.c +++ b/lapack-netlib/SRC/clarrv.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5, i__6; - real r__1, r__2; - complex q__1; + real r__1=0., r__2=0.; + complex q__1={0.,0.}; logical L__1; /* Local variables */ diff --git a/lapack-netlib/SRC/clarscl2.c b/lapack-netlib/SRC/clarscl2.c index 48641bc923..1c696fc42d 100644 --- a/lapack-netlib/SRC/clarscl2.c +++ b/lapack-netlib/SRC/clarscl2.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer i__1; - real r__1, r__2, r__3, r__4, r__5, r__6, r__7, r__8, r__9, r__10; - complex q__1, q__2, q__3; + real r__1=0., r__2=0., r__3=0., r__4=0., r__5=0., r__6=0., r__7=0., r__8=0., r__9=0., r__10=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}; /* Local variables */ real d__; diff --git a/lapack-netlib/SRC/clartv.c b/lapack-netlib/SRC/clartv.c index bbbe553d21..8e036a8fad 100644 --- a/lapack-netlib/SRC/clartv.c +++ b/lapack-netlib/SRC/clartv.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer i__1, i__2, i__3, i__4; - complex q__1, q__2, q__3, q__4; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}; /* Local variables */ integer i__, ic, ix, iy; diff --git a/lapack-netlib/SRC/clarz.c b/lapack-netlib/SRC/clarz.c index dbffdec65e..022b0a55c2 100644 --- a/lapack-netlib/SRC/clarz.c +++ b/lapack-netlib/SRC/clarz.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer c_dim1, c_offset, t_dim1, t_offset, v_dim1, v_offset, work_dim1, work_offset, i__1, i__2, i__3, i__4, i__5; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer info, i__, j; diff --git a/lapack-netlib/SRC/clarzt.c b/lapack-netlib/SRC/clarzt.c index 09fbc5efcf..97665aaf6b 100644 --- a/lapack-netlib/SRC/clarzt.c +++ b/lapack-netlib/SRC/clarzt.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer t_dim1, t_offset, v_dim1, v_offset, i__1, i__2; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer info, i__, j; diff --git a/lapack-netlib/SRC/clascl.c b/lapack-netlib/SRC/clascl.c index 31be8d40d6..91479c26bc 100644 --- a/lapack-netlib/SRC/clascl.c +++ b/lapack-netlib/SRC/clascl.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ logical done; diff --git a/lapack-netlib/SRC/clascl2.c b/lapack-netlib/SRC/clascl2.c index fada47968e..574ada3779 100644 --- a/lapack-netlib/SRC/clascl2.c +++ b/lapack-netlib/SRC/clascl2.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2, r__3, r__4; - complex q__1, q__2, q__3; + real r__1=0., r__2=0., r__3=0., r__4=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}; /* Local variables */ integer imax, jmax, j, k; diff --git a/lapack-netlib/SRC/clasyf_aa.c b/lapack-netlib/SRC/clasyf_aa.c index 9f986aedfe..7f69fd5519 100644 --- a/lapack-netlib/SRC/clasyf_aa.c +++ b/lapack-netlib/SRC/clasyf_aa.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, h_dim1, h_offset, i__1, i__2; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer j, k; diff --git a/lapack-netlib/SRC/clasyf_rk.c b/lapack-netlib/SRC/clasyf_rk.c index e592a19c3d..4d03e9d8dd 100644 --- a/lapack-netlib/SRC/clasyf_rk.c +++ b/lapack-netlib/SRC/clasyf_rk.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2; - complex q__1, q__2, q__3, q__4; + real r__1=0., r__2=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}; /* Local variables */ logical done; diff --git a/lapack-netlib/SRC/clasyf_rook.c b/lapack-netlib/SRC/clasyf_rook.c index 1ed592bd4c..2eb8ccd817 100644 --- a/lapack-netlib/SRC/clasyf_rook.c +++ b/lapack-netlib/SRC/clasyf_rook.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2; - complex q__1, q__2, q__3, q__4; + real r__1=0., r__2=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}; /* Local variables */ logical done; diff --git a/lapack-netlib/SRC/clatbs.c b/lapack-netlib/SRC/clatbs.c index df8e5e02e8..2a61201c65 100644 --- a/lapack-netlib/SRC/clatbs.c +++ b/lapack-netlib/SRC/clatbs.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer ab_dim1, ab_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2, r__3, r__4; - complex q__1, q__2, q__3, q__4; + real r__1=0., r__2=0., r__3=0., r__4=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}; /* Local variables */ integer jinc, jlen; diff --git a/lapack-netlib/SRC/clatdf.c b/lapack-netlib/SRC/clatdf.c index 7ab67315c9..2724fbccce 100644 --- a/lapack-netlib/SRC/clatdf.c +++ b/lapack-netlib/SRC/clatdf.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5; - complex q__1, q__2, q__3; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}; /* Local variables */ integer info; diff --git a/lapack-netlib/SRC/clatps.c b/lapack-netlib/SRC/clatps.c index ff8f2172ec..fc4023bb6b 100644 --- a/lapack-netlib/SRC/clatps.c +++ b/lapack-netlib/SRC/clatps.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer i__1, i__2, i__3, i__4, i__5; - real r__1, r__2, r__3, r__4; - complex q__1, q__2, q__3, q__4; + real r__1=0., r__2=0., r__3=0., r__4=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}; /* Local variables */ integer jinc, jlen; diff --git a/lapack-netlib/SRC/clatrd.c b/lapack-netlib/SRC/clatrd.c index 3e83285224..c23e185451 100644 --- a/lapack-netlib/SRC/clatrd.c +++ b/lapack-netlib/SRC/clatrd.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3; - real r__1; - complex q__1, q__2, q__3, q__4; + real r__1=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}; /* Local variables */ integer i__; diff --git a/lapack-netlib/SRC/clatrs.c b/lapack-netlib/SRC/clatrs.c index 9363e028b6..ea81651811 100644 --- a/lapack-netlib/SRC/clatrs.c +++ b/lapack-netlib/SRC/clatrs.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2, r__3, r__4; - complex q__1, q__2, q__3, q__4; + real r__1=0., r__2=0., r__3=0., r__4=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}; /* Local variables */ integer jinc; diff --git a/lapack-netlib/SRC/clatrs3.c b/lapack-netlib/SRC/clatrs3.c index 91bd5cdfdf..b84009871c 100644 --- a/lapack-netlib/SRC/clatrs3.c +++ b/lapack-netlib/SRC/clatrs3.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -381,19 +381,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer i__; diff --git a/lapack-netlib/SRC/clatsqr.c b/lapack-netlib/SRC/clatsqr.c index 42576d127b..aff5f82b6c 100644 --- a/lapack-netlib/SRC/clatsqr.c +++ b/lapack-netlib/SRC/clatsqr.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ extern /* Subroutine */ void claunhr_col_getrfnp2_(integer *, integer *, diff --git a/lapack-netlib/SRC/claunhr_col_getrfnp2.c b/lapack-netlib/SRC/claunhr_col_getrfnp2.c index 6b405e2557..ef0e1a4da9 100644 --- a/lapack-netlib/SRC/claunhr_col_getrfnp2.c +++ b/lapack-netlib/SRC/claunhr_col_getrfnp2.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2; - real r__1, r__2; - complex q__1; + real r__1=0., r__2=0.; + complex q__1={0.,0.}; /* Local variables */ integer i__; diff --git a/lapack-netlib/SRC/clauu2.c b/lapack-netlib/SRC/clauu2.c index 07c0213886..e7738623c3 100644 --- a/lapack-netlib/SRC/clauu2.c +++ b/lapack-netlib/SRC/clauu2.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3; - real r__1; - complex q__1; + real r__1=0.; + complex q__1={0.,0.}; /* Local variables */ integer i__; diff --git a/lapack-netlib/SRC/clauum.c b/lapack-netlib/SRC/clauum.c index e0e9c992c4..0bae06403d 100644 --- a/lapack-netlib/SRC/clauum.c +++ b/lapack-netlib/SRC/clauum.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer ab_dim1, ab_offset, afb_dim1, afb_offset, b_dim1, b_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2, r__3, r__4; - complex q__1; + real r__1=0., r__2=0., r__3=0., r__4=0.; + complex q__1={0.,0.}; /* Local variables */ integer kase; diff --git a/lapack-netlib/SRC/cpbstf.c b/lapack-netlib/SRC/cpbstf.c index ceb0a4d278..f527e4eeb7 100644 --- a/lapack-netlib/SRC/cpbstf.c +++ b/lapack-netlib/SRC/cpbstf.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer ab_dim1, ab_offset, afb_dim1, afb_offset, b_dim1, b_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2; - complex q__1; + real r__1=0., r__2=0.; + complex q__1={0.,0.}; /* Local variables */ real amax, smin, smax; diff --git a/lapack-netlib/SRC/cpbtf2.c b/lapack-netlib/SRC/cpbtf2.c index e2cfb6ad9f..928e82537e 100644 --- a/lapack-netlib/SRC/cpbtf2.c +++ b/lapack-netlib/SRC/cpbtf2.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer ab_dim1, ab_offset, i__1, i__2, i__3, i__4, i__5, i__6; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ complex work[1056] /* was [33][32] */; diff --git a/lapack-netlib/SRC/cpbtrs.c b/lapack-netlib/SRC/cpbtrs.c index 24939ae44d..c2e3ca34e1 100644 --- a/lapack-netlib/SRC/cpbtrs.c +++ b/lapack-netlib/SRC/cpbtrs.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer a_dim1, a_offset, af_dim1, af_offset, b_dim1, b_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2, r__3, r__4; - complex q__1; + real r__1=0., r__2=0., r__3=0., r__4=0.; + complex q__1={0.,0.}; /* Local variables */ integer kase; diff --git a/lapack-netlib/SRC/cposv.c b/lapack-netlib/SRC/cposv.c index 2616194fc3..398fa2eac1 100644 --- a/lapack-netlib/SRC/cposv.c +++ b/lapack-netlib/SRC/cposv.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer a_dim1, a_offset, af_dim1, af_offset, b_dim1, b_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2; - complex q__1; + real r__1=0., r__2=0.; + complex q__1={0.,0.}; /* Local variables */ real amax, smin, smax; diff --git a/lapack-netlib/SRC/cposvxx.c b/lapack-netlib/SRC/cposvxx.c index 6f64b6eb3f..362def66c3 100644 --- a/lapack-netlib/SRC/cposvxx.c +++ b/lapack-netlib/SRC/cposvxx.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3; - real r__1; - complex q__1, q__2; + real r__1=0.; + complex q__1={0.,0.}, q__2={0.,0.}; /* Local variables */ integer j; diff --git a/lapack-netlib/SRC/cpotrf.c b/lapack-netlib/SRC/cpotrf.c index 155302fd9d..aa3a5ad190 100644 --- a/lapack-netlib/SRC/cpotrf.c +++ b/lapack-netlib/SRC/cpotrf.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer j; diff --git a/lapack-netlib/SRC/cpotrf2.c b/lapack-netlib/SRC/cpotrf2.c index a8b2891612..27d62fb4d8 100644 --- a/lapack-netlib/SRC/cpotrf2.c +++ b/lapack-netlib/SRC/cpotrf2.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer b_dim1, b_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2, r__3, r__4; - complex q__1; + real r__1=0., r__2=0., r__3=0., r__4=0.; + complex q__1={0.,0.}; /* Local variables */ integer kase; diff --git a/lapack-netlib/SRC/cppsv.c b/lapack-netlib/SRC/cppsv.c index e9320bf1fa..9feeb448f9 100644 --- a/lapack-netlib/SRC/cppsv.c +++ b/lapack-netlib/SRC/cppsv.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer b_dim1, b_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2; - complex q__1; + real r__1=0., r__2=0.; + complex q__1={0.,0.}; /* Local variables */ real amax, smin, smax; diff --git a/lapack-netlib/SRC/cpptrf.c b/lapack-netlib/SRC/cpptrf.c index 1f2a4a84ee..f4cb4b5fd4 100644 --- a/lapack-netlib/SRC/cpptrf.c +++ b/lapack-netlib/SRC/cpptrf.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer i__1, i__2, i__3; - real r__1; - complex q__1, q__2; + real r__1=0.; + complex q__1={0.,0.}, q__2={0.,0.}; /* Local variables */ extern /* Subroutine */ void chpr_(char *, integer *, real *, complex *, diff --git a/lapack-netlib/SRC/cpptri.c b/lapack-netlib/SRC/cpptri.c index db77738622..43f4a17ecc 100644 --- a/lapack-netlib/SRC/cpptri.c +++ b/lapack-netlib/SRC/cpptri.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer i__1, i__2, i__3; - real r__1; - complex q__1; + real r__1=0.; + complex q__1={0.,0.}; /* Local variables */ extern /* Subroutine */ void chpr_(char *, integer *, real *, complex *, diff --git a/lapack-netlib/SRC/cpptrs.c b/lapack-netlib/SRC/cpptrs.c index dadb90ce2d..165441ca8d 100644 --- a/lapack-netlib/SRC/cpptrs.c +++ b/lapack-netlib/SRC/cpptrs.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3; - real r__1; - complex q__1, q__2; + real r__1=0.; + complex q__1={0.,0.}, q__2={0.,0.}; /* Local variables */ diff --git a/lapack-netlib/SRC/cpstrf.c b/lapack-netlib/SRC/cpstrf.c index a271835c25..86187113dc 100644 --- a/lapack-netlib/SRC/cpstrf.c +++ b/lapack-netlib/SRC/cpstrf.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; - real r__1; - complex q__1, q__2; + real r__1=0.; + complex q__1={0.,0.}, q__2={0.,0.}; /* Local variables */ diff --git a/lapack-netlib/SRC/cptcon.c b/lapack-netlib/SRC/cptcon.c index 3920fd2da9..2f80af3065 100644 --- a/lapack-netlib/SRC/cptcon.c +++ b/lapack-netlib/SRC/cptcon.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer b_dim1, b_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5, i__6; - real r__1, r__2, r__3, r__4, r__5, r__6, r__7, r__8, r__9, r__10, r__11, - r__12; - complex q__1, q__2, q__3; + real r__1=0., r__2=0., r__3=0., r__4=0., r__5=0., r__6=0., r__7=0., r__8=0., r__9=0., r__10=0., r__11=0., + r__12=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}; /* Local variables */ real safe1, safe2; diff --git a/lapack-netlib/SRC/cptsv.c b/lapack-netlib/SRC/cptsv.c index 2d0ece1f4d..9392bfa66d 100644 --- a/lapack-netlib/SRC/cptsv.c +++ b/lapack-netlib/SRC/cptsv.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer i__1, i__2; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ real f, g; diff --git a/lapack-netlib/SRC/cpttrs.c b/lapack-netlib/SRC/cpttrs.c index 474596e6f0..5c9c0307c0 100644 --- a/lapack-netlib/SRC/cpttrs.c +++ b/lapack-netlib/SRC/cpttrs.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer b_dim1, b_offset, i__1, i__2, i__3, i__4, i__5, i__6; - real r__1; - complex q__1, q__2, q__3, q__4; + real r__1=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}; /* Local variables */ integer i__, j; diff --git a/lapack-netlib/SRC/crot.c b/lapack-netlib/SRC/crot.c index f001028fb6..a9e9df4fc1 100644 --- a/lapack-netlib/SRC/crot.c +++ b/lapack-netlib/SRC/crot.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer b_dim1, b_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2, r__3, r__4; - complex q__1; + real r__1=0., r__2=0., r__3=0., r__4=0.; + complex q__1={0.,0.}; /* Local variables */ integer kase; diff --git a/lapack-netlib/SRC/cspsv.c b/lapack-netlib/SRC/cspsv.c index 253059704b..3571e707b1 100644 --- a/lapack-netlib/SRC/cspsv.c +++ b/lapack-netlib/SRC/cspsv.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer i__1, i__2, i__3, i__4, i__5, i__6; - real r__1, r__2, r__3, r__4; - complex q__1, q__2, q__3, q__4; + real r__1=0., r__2=0., r__3=0., r__4=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}; /* Local variables */ integer imax, jmax; diff --git a/lapack-netlib/SRC/csptri.c b/lapack-netlib/SRC/csptri.c index 57ade83260..1c0a0f244c 100644 --- a/lapack-netlib/SRC/csptri.c +++ b/lapack-netlib/SRC/csptri.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer i__1, i__2, i__3; - complex q__1, q__2, q__3; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}; /* Local variables */ complex temp, akkp1, d__; diff --git a/lapack-netlib/SRC/csptrs.c b/lapack-netlib/SRC/csptrs.c index ac3098a987..7457843512 100644 --- a/lapack-netlib/SRC/csptrs.c +++ b/lapack-netlib/SRC/csptrs.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer b_dim1, b_offset, i__1, i__2; - complex q__1, q__2, q__3; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}; /* Local variables */ complex akm1k; diff --git a/lapack-netlib/SRC/csrscl.c b/lapack-netlib/SRC/csrscl.c index e82571d081..3b726c5134 100644 --- a/lapack-netlib/SRC/csrscl.c +++ b/lapack-netlib/SRC/csrscl.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i \htmlonly *> Download CSRSCL + dependencies *> *> [TGZ] @@ -13,7 +12,6 @@ *> [ZIP] *> *> [TXT] -*> \endhtmlonly * * Definition: * =========== @@ -77,10 +75,11 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERauxiliary +*> \ingroup rscl * * ===================================================================== SUBROUTINE CSRSCL( N, SA, SX, INCX ) + IMPLICIT NONE * * -- LAPACK auxiliary routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -109,10 +108,11 @@ SUBROUTINE CSRSCL( N, SA, SX, INCX ) EXTERNAL SLAMCH * .. * .. External Subroutines .. - EXTERNAL CSSCAL, SLABAD + EXTERNAL CSSCAL * .. * .. Intrinsic Functions .. INTRINSIC ABS + INTRINSIC HUGE * .. * .. Executable Statements .. * @@ -120,12 +120,16 @@ SUBROUTINE CSRSCL( N, SA, SX, INCX ) * IF( N.LE.0 ) $ RETURN +* + IF( SA.GT.HUGE(SA) .OR. SA.LT.-HUGE(SA) ) THEN + CALL CSSCAL( N, SA, SX, INCX ) + RETURN + END IF * * Get machine parameters * SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) * * Initialize the denominator to SA and the numerator to 1. * diff --git a/lapack-netlib/SRC/cstedc.c b/lapack-netlib/SRC/cstedc.c index 9d60c9aa92..459f95dd48 100644 --- a/lapack-netlib/SRC/cstedc.c +++ b/lapack-netlib/SRC/cstedc.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2, r__3, r__4, r__5; - complex q__1; + real r__1=0., r__2=0., r__3=0., r__4=0., r__5=0.; + complex q__1={0.,0.}; /* Local variables */ integer jblk, nblk, jmax; diff --git a/lapack-netlib/SRC/cstemr.c b/lapack-netlib/SRC/cstemr.c index b817c760e4..cf214aeebc 100644 --- a/lapack-netlib/SRC/cstemr.c +++ b/lapack-netlib/SRC/cstemr.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer a_dim1, a_offset, af_dim1, af_offset, b_dim1, b_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2, r__3, r__4; - complex q__1; + real r__1=0., r__2=0., r__3=0., r__4=0.; + complex q__1={0.,0.}; /* Local variables */ integer kase; diff --git a/lapack-netlib/SRC/csysv.c b/lapack-netlib/SRC/csysv.c index e52875c7b7..02be9cd009 100644 --- a/lapack-netlib/SRC/csysv.c +++ b/lapack-netlib/SRC/csysv.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6; - real r__1, r__2, r__3, r__4; - complex q__1, q__2, q__3, q__4; + real r__1=0., r__2=0., r__3=0., r__4=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}; /* Local variables */ integer imax, jmax; diff --git a/lapack-netlib/SRC/csytf2_rk.c b/lapack-netlib/SRC/csytf2_rk.c index 018e64bdf8..cbf1d01752 100644 --- a/lapack-netlib/SRC/csytf2_rk.c +++ b/lapack-netlib/SRC/csytf2_rk.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4; - real r__1, r__2; - complex q__1, q__2, q__3, q__4, q__5, q__6; + real r__1=0., r__2=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}, q__5={0.,0.}, q__6={0.,0.}; /* Local variables */ logical done; diff --git a/lapack-netlib/SRC/csytf2_rook.c b/lapack-netlib/SRC/csytf2_rook.c index 08de556d68..9c1fc08a0a 100644 --- a/lapack-netlib/SRC/csytf2_rook.c +++ b/lapack-netlib/SRC/csytf2_rook.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4; - real r__1, r__2; - complex q__1, q__2, q__3, q__4, q__5, q__6; + real r__1=0., r__2=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}, q__5={0.,0.}, q__6={0.,0.}; /* Local variables */ logical done; diff --git a/lapack-netlib/SRC/csytrf.c b/lapack-netlib/SRC/csytrf.c index 42d0d5ba5d..7bf4590423 100644 --- a/lapack-netlib/SRC/csytrf.c +++ b/lapack-netlib/SRC/csytrf.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer ldtb, i__, j, k; diff --git a/lapack-netlib/SRC/csytrf_rk.c b/lapack-netlib/SRC/csytrf_rk.c index 4871cfef0b..06dea05594 100644 --- a/lapack-netlib/SRC/csytrf_rk.c +++ b/lapack-netlib/SRC/csytrf_rk.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3; - complex q__1, q__2, q__3; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}; /* Local variables */ complex temp, akkp1, d__; diff --git a/lapack-netlib/SRC/csytri2.c b/lapack-netlib/SRC/csytri2.c index 810a94a631..8b4cc573d8 100644 --- a/lapack-netlib/SRC/csytri2.c +++ b/lapack-netlib/SRC/csytri2.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer a_dim1, a_offset, work_dim1, work_offset, i__1, i__2, i__3, i__4, i__5, i__6; - complex q__1, q__2, q__3; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}; /* Local variables */ integer invd; diff --git a/lapack-netlib/SRC/csytri_3.c b/lapack-netlib/SRC/csytri_3.c index 7939345027..28477994e6 100644 --- a/lapack-netlib/SRC/csytri_3.c +++ b/lapack-netlib/SRC/csytri_3.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3; - complex q__1, q__2, q__3; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}; /* Local variables */ complex temp, akkp1, d__; diff --git a/lapack-netlib/SRC/csytrs.c b/lapack-netlib/SRC/csytrs.c index c3b5bc3fb4..40d103138d 100644 --- a/lapack-netlib/SRC/csytrs.c +++ b/lapack-netlib/SRC/csytrs.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2; - complex q__1, q__2, q__3; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}; /* Local variables */ complex akm1k; diff --git a/lapack-netlib/SRC/csytrs2.c b/lapack-netlib/SRC/csytrs2.c index b34cba7012..8ed009e105 100644 --- a/lapack-netlib/SRC/csytrs2.c +++ b/lapack-netlib/SRC/csytrs2.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2; - complex q__1, q__2, q__3; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}; /* Local variables */ complex akm1k; diff --git a/lapack-netlib/SRC/ctbcon.c b/lapack-netlib/SRC/ctbcon.c index 77c8b2c914..a03681d271 100644 --- a/lapack-netlib/SRC/ctbcon.c +++ b/lapack-netlib/SRC/ctbcon.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer ab_dim1, ab_offset, b_dim1, b_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2, r__3, r__4; - complex q__1; + real r__1=0., r__2=0., r__3=0., r__4=0.; + complex q__1={0.,0.}; /* Local variables */ integer kase; diff --git a/lapack-netlib/SRC/ctbtrs.c b/lapack-netlib/SRC/ctbtrs.c index 215fce1516..e2f535c17e 100644 --- a/lapack-netlib/SRC/ctbtrs.c +++ b/lapack-netlib/SRC/ctbtrs.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer i__1, i__2; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer k; diff --git a/lapack-netlib/SRC/ctfttp.c b/lapack-netlib/SRC/ctfttp.c index 7b37de986f..ed5a878469 100644 --- a/lapack-netlib/SRC/ctfttp.c +++ b/lapack-netlib/SRC/ctfttp.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer i__1, i__2, i__3, i__4; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer i__, j, k; diff --git a/lapack-netlib/SRC/ctfttr.c b/lapack-netlib/SRC/ctfttr.c index 8143db540c..42dd0883dc 100644 --- a/lapack-netlib/SRC/ctfttr.c +++ b/lapack-netlib/SRC/ctfttr.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer np1x2, i__, j, k, l; diff --git a/lapack-netlib/SRC/ctgevc.c b/lapack-netlib/SRC/ctgevc.c index 157d2e6e89..fcc6ccaac1 100644 --- a/lapack-netlib/SRC/ctgevc.c +++ b/lapack-netlib/SRC/ctgevc.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer p_dim1, p_offset, s_dim1, s_offset, vl_dim1, vl_offset, vr_dim1, vr_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2, r__3, r__4, r__5, r__6; - complex q__1, q__2, q__3, q__4; + real r__1=0., r__2=0., r__3=0., r__4=0., r__5=0., r__6=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}; /* Local variables */ integer ibeg, ieig, iend; diff --git a/lapack-netlib/SRC/ctgex2.c b/lapack-netlib/SRC/ctgex2.c index 7ab23e706c..8966cbe53d 100644 --- a/lapack-netlib/SRC/ctgex2.c +++ b/lapack-netlib/SRC/ctgex2.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, q_dim1, q_offset, z_dim1, z_offset, i__1, i__2, i__3; - real r__1; - complex q__1, q__2, q__3; + real r__1=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}; /* Local variables */ logical weak; diff --git a/lapack-netlib/SRC/ctgexc.c b/lapack-netlib/SRC/ctgexc.c index 784b53c782..9664e37d8c 100644 --- a/lapack-netlib/SRC/ctgexc.c +++ b/lapack-netlib/SRC/ctgexc.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, q_dim1, q_offset, z_dim1, z_offset, i__1, i__2, i__3; - complex q__1, q__2; + complex q__1={0.,0.}, q__2={0.,0.}; /* Local variables */ integer kase, ierr; diff --git a/lapack-netlib/SRC/ctgsja.c b/lapack-netlib/SRC/ctgsja.c index bc02dee491..60a4d0b054 100644 --- a/lapack-netlib/SRC/ctgsja.c +++ b/lapack-netlib/SRC/ctgsja.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, q_dim1, q_offset, u_dim1, u_offset, v_dim1, v_offset, i__1, i__2, i__3, i__4; - real r__1; - complex q__1; + real r__1=0.; + complex q__1={0.,0.}; /* Local variables */ extern /* Subroutine */ void crot_(integer *, complex *, integer *, diff --git a/lapack-netlib/SRC/ctgsna.c b/lapack-netlib/SRC/ctgsna.c index 34f2d7717d..0e75ecd041 100644 --- a/lapack-netlib/SRC/ctgsna.c +++ b/lapack-netlib/SRC/ctgsna.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, vl_dim1, vl_offset, vr_dim1, vr_offset, i__1; - real r__1, r__2; - complex q__1; + real r__1=0., r__2=0.; + complex q__1={0.,0.}; /* Local variables */ real cond; diff --git a/lapack-netlib/SRC/ctgsy2.c b/lapack-netlib/SRC/ctgsy2.c index e28957b0a8..194bd510e7 100644 --- a/lapack-netlib/SRC/ctgsy2.c +++ b/lapack-netlib/SRC/ctgsy2.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, d_dim1, d_offset, e_dim1, e_offset, f_dim1, f_offset, i__1, i__2, i__3, i__4; - complex q__1, q__2, q__3, q__4, q__5, q__6; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}, q__5={0.,0.}, q__6={0.,0.}; /* Local variables */ integer ierr, ipiv[2], jpiv[2], i__, j, k; diff --git a/lapack-netlib/SRC/ctgsyl.c b/lapack-netlib/SRC/ctgsyl.c index 6e7bbdab55..5ce232e7dd 100644 --- a/lapack-netlib/SRC/ctgsyl.c +++ b/lapack-netlib/SRC/ctgsyl.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, d_dim1, d_offset, e_dim1, e_offset, f_dim1, f_offset, i__1, i__2, i__3, i__4; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ real dsum; diff --git a/lapack-netlib/SRC/ctpcon.c b/lapack-netlib/SRC/ctpcon.c index b918f9b5b6..e7b2791fe5 100644 --- a/lapack-netlib/SRC/ctpcon.c +++ b/lapack-netlib/SRC/ctpcon.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, t_dim1, t_offset, v_dim1, v_offset, work_dim1, work_offset, i__1, i__2, i__3, i__4, i__5; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ logical left, backward; diff --git a/lapack-netlib/SRC/ctprfs.c b/lapack-netlib/SRC/ctprfs.c index fc9112e3e9..86271cddaf 100644 --- a/lapack-netlib/SRC/ctprfs.c +++ b/lapack-netlib/SRC/ctprfs.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer b_dim1, b_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2, r__3, r__4; - complex q__1; + real r__1=0., r__2=0., r__3=0., r__4=0.; + complex q__1={0.,0.}; /* Local variables */ integer kase; diff --git a/lapack-netlib/SRC/ctptri.c b/lapack-netlib/SRC/ctptri.c index 8f7548cc65..5fd910b538 100644 --- a/lapack-netlib/SRC/ctptri.c +++ b/lapack-netlib/SRC/ctptri.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer i__1, i__2; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer j; diff --git a/lapack-netlib/SRC/ctptrs.c b/lapack-netlib/SRC/ctptrs.c index 6aa7c04b7c..b3c3082fbb 100644 --- a/lapack-netlib/SRC/ctptrs.c +++ b/lapack-netlib/SRC/ctptrs.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer i__1, i__2, i__3, i__4; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer i__, j, k; diff --git a/lapack-netlib/SRC/ctpttr.c b/lapack-netlib/SRC/ctpttr.c index 6ddcb6297e..a734f65b84 100644 --- a/lapack-netlib/SRC/ctpttr.c +++ b/lapack-netlib/SRC/ctpttr.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer t_dim1, t_offset, vl_dim1, vl_offset, vr_dim1, vr_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2, r__3; - complex q__1, q__2; + real r__1=0., r__2=0., r__3=0.; + complex q__1={0.,0.}, q__2={0.,0.}; /* Local variables */ logical allv; diff --git a/lapack-netlib/SRC/ctrevc3.c b/lapack-netlib/SRC/ctrevc3.c index f502687137..eff4f70247 100644 --- a/lapack-netlib/SRC/ctrevc3.c +++ b/lapack-netlib/SRC/ctrevc3.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer q_dim1, q_offset, t_dim1, t_offset, i__1, i__2, i__3; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ complex temp; diff --git a/lapack-netlib/SRC/ctrrfs.c b/lapack-netlib/SRC/ctrrfs.c index df2b362c14..c77e846737 100644 --- a/lapack-netlib/SRC/ctrrfs.c +++ b/lapack-netlib/SRC/ctrrfs.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2, r__3, r__4; - complex q__1; + real r__1=0., r__2=0., r__3=0., r__4=0.; + complex q__1={0.,0.}; /* Local variables */ integer kase; diff --git a/lapack-netlib/SRC/ctrsen.c b/lapack-netlib/SRC/ctrsen.c index 5f6501ad56..58562251f6 100644 --- a/lapack-netlib/SRC/ctrsen.c +++ b/lapack-netlib/SRC/ctrsen.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer t_dim1, t_offset, vl_dim1, vl_offset, vr_dim1, vr_offset, work_dim1, work_offset, i__1, i__2, i__3, i__4, i__5; - real r__1, r__2; - complex q__1; + real r__1=0., r__2=0.; + complex q__1={0.,0.}; /* Local variables */ integer kase, ierr; diff --git a/lapack-netlib/SRC/ctrsyl.c b/lapack-netlib/SRC/ctrsyl.c index 8e5ffad1b7..0a3eb6edaf 100644 --- a/lapack-netlib/SRC/ctrsyl.c +++ b/lapack-netlib/SRC/ctrsyl.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4; - real r__1, r__2; - complex q__1, q__2, q__3, q__4; + real r__1=0., r__2=0.; + complex q__1={0.,0.}, q__2={0.,0.}, q__3={0.,0.}, q__4={0.,0.}; /* Local variables */ real smin; diff --git a/lapack-netlib/SRC/ctrsyl3.c b/lapack-netlib/SRC/ctrsyl3.c index e175c648e0..ec1d8be888 100644 --- a/lapack-netlib/SRC/ctrsyl3.c +++ b/lapack-netlib/SRC/ctrsyl3.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -381,19 +381,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer j; diff --git a/lapack-netlib/SRC/ctrtri.c b/lapack-netlib/SRC/ctrtri.c index ade3ce9c78..1ada10ca67 100644 --- a/lapack-netlib/SRC/ctrtri.c +++ b/lapack-netlib/SRC/ctrtri.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ address a__1[2]; integer a_dim1, a_offset, i__1, i__2, i__3[2], i__4, i__5; - complex q__1; + complex q__1={0.,0.}; char ch__1[2]; /* Local variables */ diff --git a/lapack-netlib/SRC/ctrtrs.c b/lapack-netlib/SRC/ctrtrs.c index c7fdfc3471..a12cfe9ba2 100644 --- a/lapack-netlib/SRC/ctrtrs.c +++ b/lapack-netlib/SRC/ctrtrs.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer np1x2, i__, j, k, l; diff --git a/lapack-netlib/SRC/ctrttp.c b/lapack-netlib/SRC/ctrttp.c index 116a080ec8..c38445dcfb 100644 --- a/lapack-netlib/SRC/ctrttp.c +++ b/lapack-netlib/SRC/ctrttp.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ /* System generated locals */ integer x11_dim1, x11_offset, x12_dim1, x12_offset, x21_dim1, x21_offset, x22_dim1, x22_offset, i__1, i__2, i__3; - real r__1; - complex q__1; + real r__1=0.; + complex q__1={0.,0.}; /* Local variables */ logical colmajor; diff --git a/lapack-netlib/SRC/cunbdb1.c b/lapack-netlib/SRC/cunbdb1.c index 6039b6dd83..3dbf4939c0 100644 --- a/lapack-netlib/SRC/cunbdb1.c +++ b/lapack-netlib/SRC/cunbdb1.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer i__, j, l; diff --git a/lapack-netlib/SRC/cung2r.c b/lapack-netlib/SRC/cung2r.c index d6365f0f29..4ac1eeac16 100644 --- a/lapack-netlib/SRC/cung2r.c +++ b/lapack-netlib/SRC/cung2r.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer i__, j, l; diff --git a/lapack-netlib/SRC/cungbr.c b/lapack-netlib/SRC/cungbr.c index 0b30a54470..0a3bb470e6 100644 --- a/lapack-netlib/SRC/cungbr.c +++ b/lapack-netlib/SRC/cungbr.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3; - complex q__1, q__2; + complex q__1={0.,0.}, q__2={0.,0.}; /* Local variables */ integer i__, j, l; diff --git a/lapack-netlib/SRC/cunglq.c b/lapack-netlib/SRC/cunglq.c index c36b3abf87..982aaaeb9b 100644 --- a/lapack-netlib/SRC/cunglq.c +++ b/lapack-netlib/SRC/cunglq.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3; - complex q__1, q__2; + complex q__1={0.,0.}, q__2={0.,0.}; /* Local variables */ integer i__, j, l; diff --git a/lapack-netlib/SRC/cungrq.c b/lapack-netlib/SRC/cungrq.c index 220d1120f7..fd495df659 100644 --- a/lapack-netlib/SRC/cungrq.c +++ b/lapack-netlib/SRC/cungrq.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, t_dim1, t_offset, i__1, i__2; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ extern /* Subroutine */ void clamtsqr_(char *, char *, integer *, integer * diff --git a/lapack-netlib/SRC/cungtsqr_row.c b/lapack-netlib/SRC/cungtsqr_row.c index f91f27ff1d..1c4bcf959e 100644 --- a/lapack-netlib/SRC/cungtsqr_row.c +++ b/lapack-netlib/SRC/cungtsqr_row.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, t_dim1, t_offset, i__1, i__2, i__3, i__4, i__5; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ integer jb_t__, itmp, lworkopt; diff --git a/lapack-netlib/SRC/cunhr_col.c b/lapack-netlib/SRC/cunhr_col.c index 1ae2f7f440..d5220f95b4 100644 --- a/lapack-netlib/SRC/cunhr_col.c +++ b/lapack-netlib/SRC/cunhr_col.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, t_dim1, t_offset, i__1, i__2, i__3, i__4, i__5; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ extern /* Subroutine */ void claunhr_col_getrfnp_(integer *, integer *, diff --git a/lapack-netlib/SRC/cunm22.c b/lapack-netlib/SRC/cunm22.c index ff6ded41fa..262b7ae46d 100644 --- a/lapack-netlib/SRC/cunm22.c +++ b/lapack-netlib/SRC/cunm22.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer q_dim1, q_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ logical left; diff --git a/lapack-netlib/SRC/cunm2l.c b/lapack-netlib/SRC/cunm2l.c index b4c9fc4288..8c03c78fdc 100644 --- a/lapack-netlib/SRC/cunm2l.c +++ b/lapack-netlib/SRC/cunm2l.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ logical left; diff --git a/lapack-netlib/SRC/cunm2r.c b/lapack-netlib/SRC/cunm2r.c index 184d488b5c..89a21dffca 100644 --- a/lapack-netlib/SRC/cunm2r.c +++ b/lapack-netlib/SRC/cunm2r.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ logical left; diff --git a/lapack-netlib/SRC/cunmbr.c b/lapack-netlib/SRC/cunmbr.c index 9cb9537367..84e0131bfe 100644 --- a/lapack-netlib/SRC/cunmbr.c +++ b/lapack-netlib/SRC/cunmbr.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ logical left; diff --git a/lapack-netlib/SRC/cunmlq.c b/lapack-netlib/SRC/cunmlq.c index 93c9f14d4c..a1a6f8369b 100644 --- a/lapack-netlib/SRC/cunmlq.c +++ b/lapack-netlib/SRC/cunmlq.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ logical left; diff --git a/lapack-netlib/SRC/cunmr3.c b/lapack-netlib/SRC/cunmr3.c index 69026f0588..03f16735e7 100644 --- a/lapack-netlib/SRC/cunmr3.c +++ b/lapack-netlib/SRC/cunmr3.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ logical left; diff --git a/lapack-netlib/SRC/cunmrq.c b/lapack-netlib/SRC/cunmrq.c index c981c78700..16af8d011e 100644 --- a/lapack-netlib/SRC/cunmrq.c +++ b/lapack-netlib/SRC/cunmrq.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i */ { /* System generated locals */ integer c_dim1, c_offset, i__1, i__2, i__3; - complex q__1; + complex q__1={0.,0.}; /* Local variables */ logical left; diff --git a/lapack-netlib/SRC/dgeev.f b/lapack-netlib/SRC/dgeev.f index fc73bb226b..4677b9f520 100644 --- a/lapack-netlib/SRC/dgeev.f +++ b/lapack-netlib/SRC/dgeev.f @@ -506,17 +506,17 @@ SUBROUTINE DGEEV( JOBVL, JOBVR, N, A, LDA, WR, WI, VL, LDVL, VR, * Undo scaling if necessary * 50 CONTINUE - IF( SCALEA .AND. INFO.GT.0) THEN + IF( SCALEA ) THEN CALL DLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, WR( INFO+1 ), $ MAX( N-INFO, 1 ), IERR ) CALL DLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, WI( INFO+1 ), $ MAX( N-INFO, 1 ), IERR ) - + IF( INFO.GT.0 ) THEN CALL DLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, WR, N, $ IERR ) CALL DLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, WI, N, $ IERR ) - + END IF END IF * WORK( 1 ) = MAXWRK diff --git a/lapack-netlib/SRC/dlaed2.f b/lapack-netlib/SRC/dlaed2.f index 1a53650e8d..42f771087d 100644 --- a/lapack-netlib/SRC/dlaed2.f +++ b/lapack-netlib/SRC/dlaed2.f @@ -75,7 +75,7 @@ *> On entry, D contains the eigenvalues of the two submatrices to *> be combined. *> On exit, D contains the trailing (N-K) updated eigenvalues -*> (those which were deflated) sorted into increasing order. +*> (those which were deflated) sorted into decreasing order. *> \endverbatim *> *> \param[in,out] Q diff --git a/lapack-netlib/SRC/dlaed8.f b/lapack-netlib/SRC/dlaed8.f index 5d1d9144d1..6a0e7ec7fe 100644 --- a/lapack-netlib/SRC/dlaed8.f +++ b/lapack-netlib/SRC/dlaed8.f @@ -85,7 +85,7 @@ *> D is DOUBLE PRECISION array, dimension (N) *> On entry, the eigenvalues of the two submatrices to be *> combined. On exit, the trailing (N-K) updated eigenvalues -*> (those which were deflated) sorted into increasing order. +*> (those which were deflated) sorted into decreasing order. *> \endverbatim *> *> \param[in,out] Q diff --git a/lapack-netlib/SRC/dlasd2.f b/lapack-netlib/SRC/dlasd2.f index 378faa6813..33724164f3 100644 --- a/lapack-netlib/SRC/dlasd2.f +++ b/lapack-netlib/SRC/dlasd2.f @@ -88,7 +88,7 @@ *> On entry D contains the singular values of the two submatrices *> to be combined. On exit D contains the trailing (N-K) updated *> singular values (those which were deflated) sorted into -*> increasing order. +*> decreasing order. *> \endverbatim *> *> \param[out] Z @@ -219,7 +219,7 @@ *> IDXQ is INTEGER array, dimension(N) *> This contains the permutation which separately sorts the two *> sub-problems in D into ascending order. Note that entries in -*> the first hlaf of this permutation must first be moved one +*> the first half of this permutation must first be moved one *> position backward; and entries in the second half *> must first have NL+1 added to their values. *> \endverbatim @@ -451,7 +451,7 @@ SUBROUTINE DLASD2( NL, NR, SQRE, K, D, Z, ALPHA, BETA, U, LDU, VT, * * Check if singular values are close enough to allow deflation. * - IF( ABS( D( J )-D( JPREV ) ).LE.TOL ) THEN + IF( ( D( J )-D( JPREV ) ).LE.TOL ) THEN * * Deflation is possible. * @@ -486,7 +486,14 @@ SUBROUTINE DLASD2( NL, NR, SQRE, K, D, Z, ALPHA, BETA, U, LDU, VT, END IF COLTYP( JPREV ) = 4 K2 = K2 - 1 - IDXP( K2 ) = JPREV +* +* Insert the deflated index in the correct position in IDXP. +* If J - JPREV is greater than 1, the indices in between +* must be shifted to preserve the correct output order. +* + DO 105 JP = JPREV, J - 1 + IDXP( K2 + J - 1 - JP ) = JP + 105 CONTINUE JPREV = J ELSE K = K + 1 diff --git a/lapack-netlib/SRC/dlasd7.f b/lapack-netlib/SRC/dlasd7.f index ff9ba4c36a..84dc419594 100644 --- a/lapack-netlib/SRC/dlasd7.f +++ b/lapack-netlib/SRC/dlasd7.f @@ -101,7 +101,7 @@ *> On entry D contains the singular values of the two submatrices *> to be combined. On exit D contains the trailing (N-K) updated *> singular values (those which were deflated) sorted into -*> increasing order. +*> decreasing order. *> \endverbatim *> *> \param[out] Z @@ -454,7 +454,7 @@ SUBROUTINE DLASD7( ICOMPQ, NL, NR, SQRE, K, D, Z, ZW, VF, VFW, VL, * * Check if singular values are close enough to allow deflation. * - IF( ABS( D( J )-D( JPREV ) ).LE.TOL ) THEN + IF( ( D( J )-D( JPREV ) ).LE.TOL ) THEN * * Deflation is possible. * @@ -490,7 +490,14 @@ SUBROUTINE DLASD7( ICOMPQ, NL, NR, SQRE, K, D, Z, ZW, VF, VFW, VL, CALL DROT( 1, VF( JPREV ), 1, VF( J ), 1, C, S ) CALL DROT( 1, VL( JPREV ), 1, VL( J ), 1, C, S ) K2 = K2 - 1 - IDXP( K2 ) = JPREV +* +* Insert the deflated index in the correct position in IDXP. +* If J - JPREV is greater than 1, the indices in between +* must be shifted to preserve the correct output order. +* + DO 85 JP = JPREV, J - 1 + IDXP( K2 + J - 1 - JP ) = JP + 85 CONTINUE JPREV = J ELSE K = K + 1 diff --git a/lapack-netlib/SRC/iparmq.c b/lapack-netlib/SRC/iparmq.c index 57578929fe..dbecda2050 100644 --- a/lapack-netlib/SRC/iparmq.c +++ b/lapack-netlib/SRC/iparmq.c @@ -745,7 +745,7 @@ integer iparmq_(integer *ispec, char *name__, char *opts, integer *n, integer /* Local variables */ integer i__, ic, nh, ns, iz; char subnam[6]; - integer name_len; + integer name_len=0; /* -- LAPACK auxiliary routine (version 3.7.1) -- */ /* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ diff --git a/lapack-netlib/SRC/lsamen.c b/lapack-netlib/SRC/lsamen.c index 252fed8dc4..84d12c7ae5 100644 --- a/lapack-netlib/SRC/lsamen.c +++ b/lapack-netlib/SRC/lsamen.c @@ -594,7 +594,7 @@ logical lsamen_(integer *n, char *ca, char *cb) /* Local variables */ integer i__; extern logical lsame_(char *, char *); - integer ca_len,cb_len; + integer ca_len=0,cb_len=0; /* -- LAPACK auxiliary routine (version 3.7.0) -- */ /* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ diff --git a/lapack-netlib/SRC/sgeev.f b/lapack-netlib/SRC/sgeev.f index adf1a1a9de..93f9932651 100644 --- a/lapack-netlib/SRC/sgeev.f +++ b/lapack-netlib/SRC/sgeev.f @@ -504,17 +504,17 @@ SUBROUTINE SGEEV( JOBVL, JOBVR, N, A, LDA, WR, WI, VL, LDVL, VR, * Undo scaling if necessary * 50 CONTINUE - IF( SCALEA .AND. INFO.GT.0) THEN + IF( SCALEA ) THEN CALL SLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, WR( INFO+1 ), $ MAX( N-INFO, 1 ), IERR ) CALL SLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, WI( INFO+1 ), $ MAX( N-INFO, 1 ), IERR ) - + IF( INFO.GT.0 ) THEN CALL SLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, WR, N, $ IERR ) CALL SLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, WI, N, $ IERR ) - + END IF END IF * WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) diff --git a/lapack-netlib/SRC/slaed2.f b/lapack-netlib/SRC/slaed2.f index cadf535555..b6be3b9c78 100644 --- a/lapack-netlib/SRC/slaed2.f +++ b/lapack-netlib/SRC/slaed2.f @@ -75,7 +75,7 @@ *> On entry, D contains the eigenvalues of the two submatrices to *> be combined. *> On exit, D contains the trailing (N-K) updated eigenvalues -*> (those which were deflated) sorted into increasing order. +*> (those which were deflated) sorted into decreasing order. *> \endverbatim *> *> \param[in,out] Q diff --git a/lapack-netlib/SRC/slaed8.f b/lapack-netlib/SRC/slaed8.f index 9dd8a15f60..2749f9bf41 100644 --- a/lapack-netlib/SRC/slaed8.f +++ b/lapack-netlib/SRC/slaed8.f @@ -85,7 +85,7 @@ *> D is REAL array, dimension (N) *> On entry, the eigenvalues of the two submatrices to be *> combined. On exit, the trailing (N-K) updated eigenvalues -*> (those which were deflated) sorted into increasing order. +*> (those which were deflated) sorted into decreasing order. *> \endverbatim *> *> \param[in,out] Q diff --git a/lapack-netlib/SRC/slasd2.f b/lapack-netlib/SRC/slasd2.f index 1902242c77..d6cd24c1f3 100644 --- a/lapack-netlib/SRC/slasd2.f +++ b/lapack-netlib/SRC/slasd2.f @@ -88,7 +88,7 @@ *> On entry D contains the singular values of the two submatrices *> to be combined. On exit D contains the trailing (N-K) updated *> singular values (those which were deflated) sorted into -*> increasing order. +*> decreasing order. *> \endverbatim *> *> \param[out] Z @@ -219,7 +219,7 @@ *> IDXQ is INTEGER array, dimension (N) *> This contains the permutation which separately sorts the two *> sub-problems in D into ascending order. Note that entries in -*> the first hlaf of this permutation must first be moved one +*> the first half of this permutation must first be moved one *> position backward; and entries in the second half *> must first have NL+1 added to their values. *> \endverbatim @@ -451,7 +451,7 @@ SUBROUTINE SLASD2( NL, NR, SQRE, K, D, Z, ALPHA, BETA, U, LDU, VT, * * Check if singular values are close enough to allow deflation. * - IF( ABS( D( J )-D( JPREV ) ).LE.TOL ) THEN + IF( ( D( J )-D( JPREV ) ).LE.TOL ) THEN * * Deflation is possible. * @@ -486,7 +486,14 @@ SUBROUTINE SLASD2( NL, NR, SQRE, K, D, Z, ALPHA, BETA, U, LDU, VT, END IF COLTYP( JPREV ) = 4 K2 = K2 - 1 - IDXP( K2 ) = JPREV +* +* Insert the deflated index in the correct position in IDXP. +* If J - JPREV is greater than 1, the indices in between +* must be shifted to preserve the correct output order. +* + DO 105 JP = JPREV, J - 1 + IDXP( K2 + J - 1 - JP ) = JP + 105 CONTINUE JPREV = J ELSE K = K + 1 diff --git a/lapack-netlib/SRC/slasd7.f b/lapack-netlib/SRC/slasd7.f index efd1f59d08..ee37f7373b 100644 --- a/lapack-netlib/SRC/slasd7.f +++ b/lapack-netlib/SRC/slasd7.f @@ -101,7 +101,7 @@ *> On entry D contains the singular values of the two submatrices *> to be combined. On exit D contains the trailing (N-K) updated *> singular values (those which were deflated) sorted into -*> increasing order. +*> decreasing order. *> \endverbatim *> *> \param[out] Z @@ -454,7 +454,7 @@ SUBROUTINE SLASD7( ICOMPQ, NL, NR, SQRE, K, D, Z, ZW, VF, VFW, VL, * * Check if singular values are close enough to allow deflation. * - IF( ABS( D( J )-D( JPREV ) ).LE.TOL ) THEN + IF( ( D( J )-D( JPREV ) ).LE.TOL ) THEN * * Deflation is possible. * @@ -490,7 +490,14 @@ SUBROUTINE SLASD7( ICOMPQ, NL, NR, SQRE, K, D, Z, ZW, VF, VFW, VL, CALL SROT( 1, VF( JPREV ), 1, VF( J ), 1, C, S ) CALL SROT( 1, VL( JPREV ), 1, VL( J ), 1, C, S ) K2 = K2 - 1 - IDXP( K2 ) = JPREV +* +* Insert the deflated index in the correct position in IDXP. +* If J - JPREV is greater than 1, the indices in between +* must be shifted to preserve the correct output order. +* + DO 85 JP = JPREV, J - 1 + IDXP( K2 + J - 1 - JP ) = JP + 85 CONTINUE JPREV = J ELSE K = K + 1 diff --git a/lapack-netlib/SRC/zbbcsd.c b/lapack-netlib/SRC/zbbcsd.c index 1d89db1b28..fbca7fa4da 100644 --- a/lapack-netlib/SRC/zbbcsd.c +++ b/lapack-netlib/SRC/zbbcsd.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i \htmlonly *> Download ZDRSCL + dependencies *> *> [TGZ] @@ -13,7 +12,6 @@ *> [ZIP] *> *> [TXT] -*> \endhtmlonly * * Definition: * =========== @@ -77,10 +75,11 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERauxiliary +*> \ingroup rscl * * ===================================================================== SUBROUTINE ZDRSCL( N, SA, SX, INCX ) + IMPLICIT NONE * * -- LAPACK auxiliary routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -109,10 +108,11 @@ SUBROUTINE ZDRSCL( N, SA, SX, INCX ) EXTERNAL DLAMCH * .. * .. External Subroutines .. - EXTERNAL DLABAD, ZDSCAL + EXTERNAL ZDSCAL * .. * .. Intrinsic Functions .. INTRINSIC ABS + INTRINSIC HUGE * .. * .. Executable Statements .. * @@ -120,12 +120,16 @@ SUBROUTINE ZDRSCL( N, SA, SX, INCX ) * IF( N.LE.0 ) $ RETURN +* + IF( SA.GT.HUGE(SA) .OR. SA.LT.-HUGE(SA) ) THEN + CALL ZDSCAL( N, SA, SX, INCX ) + RETURN + END IF * * Get machine parameters * SMLNUM = DLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Initialize the denominator to SA and the numerator to 1. * diff --git a/lapack-netlib/SRC/zgbbrd.c b/lapack-netlib/SRC/zgbbrd.c index 21e247b933..67a54e3327 100644 --- a/lapack-netlib/SRC/zgbbrd.c +++ b/lapack-netlib/SRC/zgbbrd.c @@ -190,8 +190,8 @@ typedef struct Namelist Namelist; #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER -#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define c_div(c, a, b) {float nenn=crealf(_FCmulcc(Cf(b),conjf(Cf(b)))); _Fcomplex zaehl=_FCmulcc(Cf(a),conjf(Cf(b))); pCf(c)=_FCbuild(crealf(zaehl)/nenn,cimagf(zaehl)/nenn);} +#define z_div(c, a, b) {double nenn=creal(_Cmulcc(Cd(b),conj(Cd(b)))); _Dcomplex zaehl=_Cmulcc(Cd(a),conj(Cd(b))); pCd(c)=_Cbuild(creal(zaehl)/nenn,cimag(zaehl)/nenn);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -378,19 +378,24 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) if (w[i-1]>m) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;im) mi=i ,m=w[i-1]; return mi-s+1; } + static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; #ifdef _MSC_VER _Fcomplex zdotc = {0.0, 0.0}; if (incx == 1 && incy == 1) { for (i=0;i +#include "common.h" + +#define max(a,b) ((a) > (b) ? (a) : (b)) +#define copysign(x,y) ((y) < 0 ? ((x) < 0 ? (x) : -(x)) : ((x) < 0 ? -(x) : (x))) + +#if defined(DOUBLE) +#define LAMC3 BLASFUNC(dlamc3) +#define LAED4 BLASFUNC(dlaed4) +#define GEMM BLASFUNC(dgemm) +#define NRM2 BLASFUNC(dnrm2) +#define COPY BLASFUNC(dcopy) +#define LACPY BLASFUNC(dlacpy) +#define LASET BLASFUNC(dlaset) +#else +#define LAMC3 BLASFUNC(slamc3) +#define LAED4 BLASFUNC(slaed4) +#define GEMM BLASFUNC(sgemm) +#define NRM2 BLASFUNC(snrm2) +#define COPY BLASFUNC(scopy) +#define LACPY BLASFUNC(slacpy) +#define LASET BLASFUNC(slaset) +#endif + +FLOAT LAMC3(FLOAT *, FLOAT *); +void LAED4(blasint *, blasint *, FLOAT *, FLOAT *, FLOAT *, FLOAT *, FLOAT *, blasint *); +void LACPY(char *, blasint *, blasint *, FLOAT *, blasint *, FLOAT *, blasint *); +void LASET(char *, blasint *, blasint *, FLOAT *, FLOAT *, FLOAT *, blasint *); + +/* Table of constant values */ +static blasint c1 = 1; +static FLOAT c1f = 1.; +static FLOAT c0f = 0.; + +static void inner_laed4_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + blasint kval = args -> m; + blasint j, j_from, j_to; + FLOAT *dlamda = (FLOAT *)args -> a; + FLOAT *w = (FLOAT *)args -> b; + FLOAT *q = (FLOAT *)args -> c; + BLASLONG qdim = args -> ldc; + FLOAT *d = (FLOAT *)args -> d; + FLOAT rho = *(FLOAT *)args -> alpha; + blasint *info = &((blasint*)args -> beta)[mypos]; + + j_from = range_m[0] + 1; + j_to = range_m[1]; + + for (j = j_from; j <= j_to; j++) { + LAED4(&kval, &j, dlamda, w, &q[(j - 1) * qdim], &rho, &d[j - 1], info); + if(*info != 0) break; + } +} + +static void inner_wloop_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + blasint kval = args -> m; + blasint i, j, i_from, i_to; + FLOAT *dlamda = (FLOAT *)args -> a; + FLOAT *w = (FLOAT *)args -> b; + FLOAT *q = (FLOAT *)args -> c; + BLASLONG qdim = args -> ldc; + i_from = range_m[0]; + i_to = range_m[1]; + for (j = 0; j < kval; j++) { + for (i = i_from; i < i_to; i++) { + if (i != j) w[i] *= q[j * qdim + i] / (dlamda[i] - dlamda[j]); + } + } +} + +/* ===================================================================== */ +blasint CNAME(blasint *k, blasint *n, blasint *n1, FLOAT *d, + FLOAT *q, blasint *ldq, FLOAT *rho, FLOAT *dlamda, + FLOAT *q2, blasint *indx, blasint *ctot, FLOAT *w, + FLOAT *s, blasint *info) +{ + FLOAT temp; + blasint kval, qdim; + blasint i, j, itmp; + blasint n2, n12, ii, n23, iq2; + blas_queue_t queue[MAX_CPU_NUMBER]; + blas_arg_t args; + BLASLONG range[MAX_CPU_NUMBER + 1]; + blasint infoarray[MAX_CPU_NUMBER]; + int width, num_cpu, mode, nthreads; + + qdim = *ldq; + kval = *k; + +/* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can */ +/* be computed with high relative accuracy (barring over/underflow). */ + + for (i = 0; i < kval; i++) { + dlamda[i] = LAMC3(&dlamda[i], &dlamda[i]) - dlamda[i]; + } + + nthreads = num_cpu_avail(4); + +#if defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif + args.m = kval; + args.a = (void *)dlamda; + args.b = (void *)w; + args.c = (void *)q; + args.ldc = qdim; + args.d = (void *)d; + args.alpha = (void *)rho; + args.beta = (void *)infoarray; + num_cpu = 0; + range[0] = 0; + i = kval; + while (i > 0) { + width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); + range[num_cpu + 1] = range[num_cpu] + width; + queue[num_cpu].range_m = &range[num_cpu]; + queue[num_cpu].range_n = NULL; + queue[num_cpu].routine = inner_laed4_thread; + queue[num_cpu].args = &args; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].mode = mode; + queue[num_cpu].next = &queue[num_cpu + 1]; + infoarray[num_cpu] = 0; + num_cpu ++; + i -= width; + } + if (num_cpu) { + queue[num_cpu - 1].next = NULL; + exec_blas(num_cpu, queue); + } + for (i = 0; i < num_cpu; i++) { + *info = max(infoarray[i], *info); + } + +/* If the zero finder fails, the computation is terminated. */ + + if (*info != 0) { + return 0; + } + + if (kval == 2) { + for (j = 0; j < kval; j++) { + w[0] = q[j * qdim]; + w[1] = q[j * qdim + 1]; + ii = indx[0] - 1; + q[j * qdim] = w[ii]; + ii = indx[1] - 1; + q[j * qdim + 1] = w[ii]; + } + } else if (kval != 1) { + +/* Compute updated W. */ + + COPY(k, w, &c1, s, &c1); + +/* Initialize W(I) = Q(I,I) */ + + itmp = qdim + 1; + COPY(k, q, &itmp, w, &c1); + + for (i = 0; i < num_cpu; i++) { + queue[i].routine = inner_wloop_thread; + } + if (num_cpu) { + exec_blas(num_cpu, queue); + } + for (i = 0; i < kval; i++) { + temp = sqrt(-w[i]); + w[i] = copysign(temp, s[i]); + } + +/* Compute eigenvectors of the modified rank-1 modification. */ + + for (j = 0; j < kval; j++) { + for (i = 0; i < kval; i++) { + s[i] = w[i] / q[j * qdim + i]; + } + temp = NRM2(k, s, &c1); + for (i = 0; i < kval; i++) { + ii = indx[i] - 1; + q[j * qdim + i] = s[ii] / temp; + } + } + } + +/* Compute the updated eigenvectors. */ + + n2 = *n - *n1; + n12 = ctot[0] + ctot[1]; + n23 = ctot[1] + ctot[2]; + + LACPY("A", &n23, k, &q[ctot[0]], ldq, s, &n23); + iq2 = *n1 * n12; + if (n23 != 0) { + GEMM("N", "N", &n2, k, &n23, &c1f, &q2[iq2], &n2, s, &n23, &c0f, &q[*n1], ldq); + } else { + LASET("A", &n2, k, &c0f, &c0f, &q[*n1], ldq); + } + + LACPY("A", &n12, k, q, ldq, s, &n12); + if (n12 != 0) { + GEMM("N", "N", n1, k, &n12, &c1f, q2, n1, s, &n12, &c0f, q, ldq); + } else { + LASET("A", n1, k, &c0f, &c0f, q, ldq); + } + + return 0; +} diff --git a/lapack/laed3/laed3_single.c b/lapack/laed3/laed3_single.c new file mode 100644 index 0000000000..b21bb99300 --- /dev/null +++ b/lapack/laed3/laed3_single.c @@ -0,0 +1,166 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#define copysign(x,y) ((y) < 0 ? ((x) < 0 ? (x) : -(x)) : ((x) < 0 ? -(x) : (x))) + +#if defined(DOUBLE) +#define LAMC3 BLASFUNC(dlamc3) +#define LAED4 BLASFUNC(dlaed4) +#define GEMM BLASFUNC(dgemm) +#define NRM2 BLASFUNC(dnrm2) +#define COPY BLASFUNC(dcopy) +#define LACPY BLASFUNC(dlacpy) +#define LASET BLASFUNC(dlaset) +#else +#define LAMC3 BLASFUNC(slamc3) +#define LAED4 BLASFUNC(slaed4) +#define GEMM BLASFUNC(sgemm) +#define NRM2 BLASFUNC(snrm2) +#define COPY BLASFUNC(scopy) +#define LACPY BLASFUNC(slacpy) +#define LASET BLASFUNC(slaset) +#endif + +FLOAT LAMC3(FLOAT *, FLOAT *); +void LAED4(blasint *, blasint *, FLOAT *, FLOAT *, FLOAT *, FLOAT *, FLOAT *, blasint *); +void LACPY(char *, blasint *, blasint *, FLOAT *, blasint *, FLOAT *, blasint *); +void LASET(char *, blasint *, blasint *, FLOAT *, FLOAT *, FLOAT *, blasint *); + +/* Table of constant values */ +static blasint c1 = 1; +static FLOAT c1f = 1.; +static FLOAT c0f = 0.; + +/* ===================================================================== */ +blasint CNAME(blasint *k, blasint *n, blasint *n1, FLOAT *d, + FLOAT *q, blasint *ldq, FLOAT *rho, FLOAT *dlamda, + FLOAT *q2, blasint *indx, blasint *ctot, FLOAT *w, + FLOAT *s, blasint *info) +{ + FLOAT temp; + blasint kval, qdim; + blasint i, j, itmp; + blasint n2, n12, ii, n23, iq2; + + qdim = *ldq; + kval = *k; + +/* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can */ +/* be computed with high relative accuracy (barring over/underflow). */ + + for (i = 0; i < kval; i++) { + dlamda[i] = LAMC3(&dlamda[i], &dlamda[i]) - dlamda[i]; + } + + for (j = 1; j <= kval; j++) { + LAED4(k, &j, dlamda, w, &q[(j - 1) * qdim], rho, &d[j - 1], info); + if(*info != 0) break; + } + +/* If the zero finder fails, the computation is terminated. */ + + if (*info != 0) { + return 0; + } + + if (kval == 2) { + for (j = 0; j < kval; j++) { + w[0] = q[j * qdim]; + w[1] = q[j * qdim + 1]; + ii = indx[0] - 1; + q[j * qdim] = w[ii]; + ii = indx[1] - 1; + q[j * qdim + 1] = w[ii]; + } + } else if (kval != 1) { + +/* Compute updated W. */ + + COPY(k, w, &c1, s, &c1); + +/* Initialize W(I) = Q(I,I) */ + + itmp = qdim + 1; + COPY(k, q, &itmp, w, &c1); + for (j = 0; j < kval; j++) { + for (i = 0; i < j; i++) { + w[i] *= q[j * qdim + i] / (dlamda[i] - dlamda[j]); + } + for (i = j + 1; i < kval; i++) { + w[i] *= q[j * qdim + i] / (dlamda[i] - dlamda[j]); + } + } + for (i = 0; i < kval; i++) { + temp = sqrt(-w[i]); + w[i] = copysign(temp, s[i]); + } + +/* Compute eigenvectors of the modified rank-1 modification. */ + + for (j = 0; j < kval; j++) { + for (i = 0; i < kval; i++) { + s[i] = w[i] / q[j * qdim + i]; + } + temp = NRM2(k, s, &c1); + for (i = 0; i < kval; i++) { + ii = indx[i] - 1; + q[j * qdim + i] = s[ii] / temp; + } + } + } + +/* Compute the updated eigenvectors. */ + + n2 = *n - *n1; + n12 = ctot[0] + ctot[1]; + n23 = ctot[1] + ctot[2]; + + LACPY("A", &n23, k, &q[ctot[0]], ldq, s, &n23); + iq2 = *n1 * n12; + if (n23 != 0) { + GEMM("N", "N", &n2, k, &n23, &c1f, &q2[iq2], &n2, s, &n23, &c0f, &q[*n1], ldq); + } else { + LASET("A", &n2, k, &c0f, &c0f, &q[*n1], ldq); + } + + LACPY("A", &n12, k, q, ldq, s, &n12); + if (n12 != 0) { + GEMM("N", "N", n1, k, &n12, &c1f, q2, n1, s, &n12, &c0f, q, ldq); + } else { + LASET("A", n1, k, &c0f, &c0f, q, ldq); + } + + return 0; +} diff --git a/lapack/potrf/potrf_parallel.c b/lapack/potrf/potrf_parallel.c index c38a2632d5..1e512848b8 100644 --- a/lapack/potrf/potrf_parallel.c +++ b/lapack/potrf/potrf_parallel.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2025 The OpenBLAS Project. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -405,8 +406,8 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; -#elif defined(HALF) - mode = BLAS_HALF | BLAS_REAL; +#elif defined(BFLOAT16) + mode = BLAS_BFLOAT16 | BLAS_REAL; mask = MAX(SBGEMM_UNROLL_M, SBGEMM_UNROLL_N) - 1; #else mode = BLAS_SINGLE | BLAS_REAL; diff --git a/openblas_config_template.h b/openblas_config_template.h index 6a73821081..e6637f788e 100644 --- a/openblas_config_template.h +++ b/openblas_config_template.h @@ -39,6 +39,13 @@ typedef unsigned long BLASULONG; typedef uint16_t bfloat16; #endif +#if defined(__GNUC__) && (__GNUC__ >= 12) +typedef _Float16 hfloat16; +#else +#include +typedef uint16_t hfloat16; +#endif + #ifdef OPENBLAS_USE64BITINT typedef BLASLONG blasint; #else diff --git a/param.h b/param.h index 48b64fd2ae..8112e8915a 100644 --- a/param.h +++ b/param.h @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011-2023, The OpenBLAS Project +Copyright (c) 2011-2023, 2025 The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -72,6 +72,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef PARAM_H #define PARAM_H +#define SHGEMM_DEFAULT_UNROLL_N 8 +#define SHGEMM_DEFAULT_UNROLL_M 8 +#define SHGEMM_DEFAULT_UNROLL_MN 32 +#define SHGEMM_DEFAULT_P 128 +#define SHGEMM_DEFAULT_R 240 +#define SHGEMM_DEFAULT_Q 12288 + +#define BGEMM_DEFAULT_UNROLL_N 4 +#define BGEMM_DEFAULT_UNROLL_M 8 +#define BGEMM_DEFAULT_UNROLL_MN 32 +#define BGEMM_DEFAULT_P 256 +#define BGEMM_DEFAULT_R 256 +#define BGEMM_DEFAULT_Q 256 +#define BGEMM_ALIGN_K 1 // must be 2^x #define SBGEMM_DEFAULT_UNROLL_N 4 #define SBGEMM_DEFAULT_UNROLL_M 8 @@ -1766,7 +1780,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -1899,7 +1913,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -3138,10 +3152,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef RISCV64_ZVL128B + #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL +#undef SHGEMM_DEFAULT_UNROLL_M +#undef SHGEMM_DEFAULT_UNROLL_N +#define SHGEMM_DEFAULT_UNROLL_M 8 +#define SHGEMM_DEFAULT_UNROLL_N 8 + +#undef SBGEMM_DEFAULT_UNROLL_M +#undef SBGEMM_DEFAULT_UNROLL_N +#define SBGEMM_DEFAULT_UNROLL_M 8 +#define SBGEMM_DEFAULT_UNROLL_N 8 + #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 8 @@ -3154,16 +3179,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 +#undef SHGEMM_DEFAULT_P +#define SHGEMM_DEFAULT_P 128 +#undef SBGEMM_DEFAULT_P +#define SBGEMM_DEFAULT_P 128 #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 96 #define ZGEMM_DEFAULT_P 64 +#undef SHGEMM_DEFAULT_Q +#define SHGEMM_DEFAULT_Q 240 +#undef SBGEMM_DEFAULT_Q +#define SBGEMM_DEFAULT_Q 240 #define SGEMM_DEFAULT_Q 240 #define DGEMM_DEFAULT_Q 120 #define CGEMM_DEFAULT_Q 120 #define ZGEMM_DEFAULT_Q 120 +#undef SHGEMM_DEFAULT_R +#define SHGEMM_DEFAULT_R 12288 +#undef SBGEMM_DEFAULT_R +#define SBGEMM_DEFAULT_R 12288 #define SGEMM_DEFAULT_R 12288 #define DGEMM_DEFAULT_R 8192 #define CGEMM_DEFAULT_R 4096 @@ -3181,6 +3218,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL +#undef SHGEMM_DEFAULT_UNROLL_M +#undef SHGEMM_DEFAULT_UNROLL_N +#define SHGEMM_DEFAULT_UNROLL_M 16 +#define SHGEMM_DEFAULT_UNROLL_N 8 + +#undef SBGEMM_DEFAULT_UNROLL_M +#undef SBGEMM_DEFAULT_UNROLL_N +#define SBGEMM_DEFAULT_UNROLL_M 16 +#define SBGEMM_DEFAULT_UNROLL_N 8 + #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 @@ -3193,16 +3240,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 4 +#undef SHGEMM_DEFAULT_P +#define SHGEMM_DEFAULT_P 128 +#undef SBGEMM_DEFAULT_P +#define SBGEMM_DEFAULT_P 128 #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 64 #define CGEMM_DEFAULT_P 64 #define ZGEMM_DEFAULT_P 64 +#undef SHGEMM_DEFAULT_Q +#define SHGEMM_DEFAULT_Q 128 +#undef SBGEMM_DEFAULT_Q +#define SBGEMM_DEFAULT_Q 128 #define SGEMM_DEFAULT_Q 128 #define DGEMM_DEFAULT_Q 128 #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 64 +#undef SHGEMM_DEFAULT_R +#define SHGEMM_DEFAULT_R 16384 +#undef SBGEMM_DEFAULT_R +#define SBGEMM_DEFAULT_R 16384 #define SGEMM_DEFAULT_R 16384 #define DGEMM_DEFAULT_R 8192 #define CGEMM_DEFAULT_R 8192 @@ -3316,7 +3375,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(CORTEXA57) || defined(CORTEXX1) || \ defined(CORTEXA72) || defined(CORTEXA73) || \ - defined(FALKOR) || defined(TSV110) || defined(EMAG8180) || defined(VORTEX) || defined(FT2000) + defined(FALKOR) || defined(TSV110) || defined(EMAG8180) || defined(VORTEX) || defined(FT2000) || defined(VORTEXM4) #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -3333,7 +3392,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*FIXME: this should be using the cache size, but there is currently no easy way to query that on ARM. So if getarch counted more than 8 cores we simply assume the host is a big desktop or server with abundant cache rather than a phone or embedded device */ -#if NUM_CORES > 8 || defined(TSV110) || defined(EMAG8180) || defined(VORTEX)|| defined(CORTEXX1) +#if NUM_CORES > 8 || defined(TSV110) || defined(EMAG8180) || defined(VORTEX)|| defined(CORTEXX1) || defined(VORTEXM4) #define SGEMM_DEFAULT_P 512 #define DGEMM_DEFAULT_P 256 #define CGEMM_DEFAULT_P 256 @@ -3548,6 +3607,8 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #elif defined(NEOVERSEV1) // 256-bit SVE +#define GEMM_DIVIDE_LIMIT 3 + #if defined(XDOUBLE) || defined(DOUBLE) #define SWITCH_RATIO 8 #define GEMM_PREFERED_SIZE 4 @@ -3556,11 +3617,18 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define GEMM_PREFERED_SIZE 8 #endif +#undef BGEMM_ALIGN_K +#undef BGEMM_DEFAULT_UNROLL_M +#undef BGEMM_DEFAULT_UNROLL_N +#define BGEMM_ALIGN_K 4 +#define BGEMM_DEFAULT_UNROLL_M 8 +#define BGEMM_DEFAULT_UNROLL_N 4 + #undef SBGEMM_ALIGN_K #undef SBGEMM_DEFAULT_UNROLL_M #undef SBGEMM_DEFAULT_UNROLL_N -#define SBGEMM_ALIGN_K 8 -#define SBGEMM_DEFAULT_UNROLL_M 4 +#define SBGEMM_ALIGN_K 4 +#define SBGEMM_DEFAULT_UNROLL_M 8 #define SBGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_UNROLL_M 16 @@ -3592,7 +3660,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#elif defined(NEOVERSEN2) +#elif defined(NEOVERSEN2) || defined(NEOVERSEV2) #if defined(XDOUBLE) || defined(DOUBLE) #define SWITCH_RATIO 8 @@ -3600,11 +3668,17 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define SWITCH_RATIO 16 #endif -#undef SBGEMM_ALIGN_K -#define SBGEMM_ALIGN_K 4 +#undef BGEMM_ALIGN_K +#undef BGEMM_DEFAULT_UNROLL_M +#undef BGEMM_DEFAULT_UNROLL_N +#define BGEMM_ALIGN_K 4 +#define BGEMM_DEFAULT_UNROLL_M 8 +#define BGEMM_DEFAULT_UNROLL_N 4 +#undef SBGEMM_ALIGN_K #undef SBGEMM_DEFAULT_UNROLL_M #undef SBGEMM_DEFAULT_UNROLL_N +#define SBGEMM_ALIGN_K 4 #define SBGEMM_DEFAULT_UNROLL_M 8 #define SBGEMM_DEFAULT_UNROLL_N 4 @@ -3635,8 +3709,51 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 +#elif defined(AMPERE1) + +#if defined(XDOUBLE) || defined(DOUBLE) +#define SWITCH_RATIO 8 +#else +#define SWITCH_RATIO 16 +#endif + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 240 +#define DGEMM_DEFAULT_P 240 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 640 +#define DGEMM_DEFAULT_Q 320 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + #elif defined(A64FX) // 512-bit SVE +#define GEMM_DIVIDE_RATE 1 + +#if defined(XDOUBLE) || defined(DOUBLE) +#define GEMM_PREFERED_SIZE 8 +#else +#define GEMM_PREFERED_SIZE 16 +#endif + /* When all BLAS3 routines are implemeted with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl". Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions seperated. */ #define SGEMM_DEFAULT_UNROLL_M 4 @@ -3661,18 +3778,18 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define ZGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_MN 16 -#define SGEMM_DEFAULT_P 128 -#define DGEMM_DEFAULT_P 160 +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 128 #define ZGEMM_DEFAULT_P 128 -#define SGEMM_DEFAULT_Q 352 -#define DGEMM_DEFAULT_Q 128 +#define SGEMM_DEFAULT_Q 896 +#define DGEMM_DEFAULT_Q 448 #define CGEMM_DEFAULT_Q 224 #define ZGEMM_DEFAULT_Q 112 -#define SGEMM_DEFAULT_R 4096 -#define DGEMM_DEFAULT_R 4096 +#define SGEMM_DEFAULT_R 3072 +#define DGEMM_DEFAULT_R 3072 #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 @@ -3747,8 +3864,12 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #endif /* ARMv8 */ -#if defined(ARMV9SME) /* ARMv9 SME */ +#if defined(ARMV9SME) || (defined(VORTEXM4)&&defined(__clang__)) /* ARMv9 SME */ #define USE_SGEMM_KERNEL_DIRECT 1 +#define USE_SSYMM_KERNEL_DIRECT 1 +#define USE_STRMM_KERNEL_DIRECT 1 +#define USE_SSYRK_KERNEL_DIRECT 1 +#define USE_SSYR2K_KERNEL_DIRECT 1 #endif /* ARMv9 SME */ #if defined(ARMV5) @@ -4046,8 +4167,6 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 -#define CGEMM3M_DEFAULT_UNROLL_N 2 -#define ZGEMM3M_DEFAULT_UNROLL_N 2 #ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_M 2 @@ -4063,8 +4182,11 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define CGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define XGEMM_DEFAULT_UNROLL_M 1 +#endif #define CGEMM3M_DEFAULT_UNROLL_M 2 #define ZGEMM3M_DEFAULT_UNROLL_M 2 +#define CGEMM3M_DEFAULT_UNROLL_N 2 +#define ZGEMM3M_DEFAULT_UNROLL_N 2 #define CGEMM3M_DEFAULT_P 448 #define ZGEMM3M_DEFAULT_P 224 #define XGEMM3M_DEFAULT_P 112 @@ -4075,7 +4197,7 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define ZGEMM3M_DEFAULT_R 12288 #define XGEMM3M_DEFAULT_R 12288 -#endif + #ifdef ARCH_MIPS #define SGEMM_DEFAULT_P 128 diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index f874fa5eaa..e3491d7f11 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -36,6 +36,28 @@ foreach(test_bin ${OpenBLAS_Tests}) target_link_libraries(${test_bin} ${OpenBLAS_LIBNAME}) endforeach() +if (BUILD_BFLOAT16) + add_executable(test_bgemm compare_sgemm_bgemm.c) + target_compile_definitions(test_bgemm PUBLIC -DIBFLOAT16 -DOBFLOAT16) + target_link_libraries(test_bgemm ${OpenBLAS_LIBNAME}) + add_executable(test_bgemv compare_sgemv_bgemv.c) + target_compile_definitions(test_bgemv PUBLIC -DIBFLOAT16 -DOBFLOAT16) + target_link_libraries(test_bgemv ${OpenBLAS_LIBNAME}) + add_executable(test_sbgemm compare_sgemm_sbgemm.c) + target_compile_definitions(test_sbgemm PUBLIC -DIBFLOAT16) + target_link_libraries(test_sbgemm ${OpenBLAS_LIBNAME}) + add_executable(test_sbgemv compare_sgemv_sbgemv.c) + target_compile_definitions(test_sbgemv PUBLIC -DIBFLOAT16) + target_link_libraries(test_sbgemv ${OpenBLAS_LIBNAME}) +endif() + +if (BUILD_HFLOAT16) + add_executable(test_shgemm compare_sgemm_shgemm.c) + target_link_libraries(test_shgemm ${OpenBLAS_LIBNAME}) + add_executable(test_shgemv compare_sgemv_shgemv.c) + target_link_libraries(test_shgemv ${OpenBLAS_LIBNAME}) +endif() + # $1 exec, $2 input, $3 output_result if(WIN32) FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.ps1 @@ -94,3 +116,21 @@ add_test(NAME "${float_type}blas3_3m" endif() endif() endforeach() + +if (BUILD_BFLOAT16) + add_test(NAME "bgemm" + COMMAND $) + add_test(NAME "bgemv" + COMMAND $) + add_test(NAME "sbgemm" + COMMAND $) + add_test(NAME "sbgemv" + COMMAND $) +endif() + +if (BUILD_HFLOAT16) + add_test(NAME "shgemm" + COMMAND $) + add_test(NAME "shgemv" + COMMAND $) +endif() diff --git a/test/Makefile b/test/Makefile index 9ba88988b2..8b69976f76 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,3 +1,31 @@ +############################################################################### +# Copyright (c) 2025, The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +############################################################################### TOPDIR = .. include ../Makefile.system ifeq ($(F_COMPILER),GFORTRAN) @@ -6,6 +34,7 @@ ifneq (, $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) endif override FFLAGS += -fno-tree-vectorize endif +# override CFLAGS += -std=c11 -Wall -Werror SUPPORT_GEMM3M = 0 @@ -90,6 +119,13 @@ endif endif endif +ifeq ($(BUILD_HFLOAT16), 1) +SH2 = test_shgemv +endif +ifeq ($(BUILD_BFLOAT16), 1) +BB2 = test_bgemv +B2 = test_sbgemv +endif ifeq ($(BUILD_SINGLE),1) S2=sblat2 endif @@ -103,11 +139,21 @@ ifeq ($(BUILD_COMPLEX16),1) Z2=zblat2 endif -level2: $(S2) $(D2) $(C2) $(Z2) +level2: $(SH2) $(BB2) $(B2) $(S2) $(D2) $(C2) $(Z2) ifneq ($(CROSS), 1) rm -f ?BLAT2.SUMM +ifeq ($(BUILD_BFLOAT16),1) + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_bgemv > BBLAT2.SUMM + @$(GREP) -q FATAL BBLAT2.SUMM && cat BBLAT2.SUMM || exit 0 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_sbgemv > SBBLAT2.SUMM + @$(GREP) -q FATAL SBBLAT2.SUMM && cat SBBLAT2.SUMM || exit 0 +endif +ifeq ($(BUILD_HFLOAT16),1) + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_shgemv > SHBLAT2.SUMM + @$(GREP) -q FATAL SHBLAT2.SUMM && cat SHBLAT2.SUMM || exit 0 +endif ifeq ($(BUILD_SINGLE),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat2 < ./sblat2.dat @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 @@ -127,6 +173,16 @@ endif ifdef SMP rm -f ?BLAT2.SUMM ifeq ($(USE_OPENMP), 1) +ifeq ($(BUILD_BFLOAT16),1) + OMP_NUM_THREADS=2 ./test_bgemv > BBLAT2.SUMM + @$(GREP) -q FATAL BBLAT2.SUMM && cat BBLAT2.SUMM || exit 0 + OMP_NUM_THREADS=2 ./test_sbgemv > SBBLAT2.SUMM + @$(GREP) -q FATAL SBBLAT2.SUMM && cat SBBLAT2.SUMM || exit 0 +endif +ifeq ($(BUILD_HFLOAT16),1) + OMP_NUM_THREADS=2 ./test_shgemv > SHBLAT2.SUMM + @$(GREP) -q FATAL SHBLAT2.SUMM && cat SHBLAT2.SUMM || exit 0 +endif ifeq ($(BUILD_SINGLE),1) OMP_NUM_THREADS=2 ./sblat2 < ./sblat2.dat @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 @@ -144,6 +200,16 @@ ifeq ($(BUILD_COMPLEX16),1) @$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0 endif else +ifeq ($(BUILD_BFLOAT16),1) + OMP_NUM_THREADS=2 ./test_bgemv > BBLAT2.SUMM + @$(GREP) -q FATAL BBLAT2.SUMM && cat BBLAT2.SUMM || exit 0 + OMP_NUM_THREADS=2 ./test_sbgemv > SBBLAT2.SUMM + @$(GREP) -q FATAL SBBLAT2.SUMM && cat SBBLAT2.SUMM || exit 0 +endif +ifeq ($(BUILD_HFLOAT16),1) + OMP_NUM_THREADS=2 ./test_shgemv > SHBLAT2.SUMM + @$(GREP) -q FATAL SHBLAT2.SUMM && cat SHBLAT2.SUMM || exit 0 +endif ifeq ($(BUILD_SINGLE),1) OPENBLAS_NUM_THREADS=2 ./sblat2 < ./sblat2.dat @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 @@ -165,7 +231,11 @@ endif endif ifeq ($(BUILD_BFLOAT16),1) -B3= test_sbgemm +BF3= test_bgemm +B3 = test_sbgemm +endif +ifeq ($(BUILD_HFLOAT16),1) +H3 = test_shgemm endif ifeq ($(BUILD_SINGLE),1) S3=sblat3 @@ -190,9 +260,9 @@ endif ifeq ($(SUPPORT_GEMM3M),1) -level3: $(B3) $(S3) $(D3) $(C3) $(Z3) level3_3m +level3: $(BF3) $(B3) $(H3) $(S3) $(D3) $(C3) $(Z3) level3_3m else -level3: $(B3) $(S3) $(D3) $(C3) $(Z3) +level3: $(BF3) $(B3) $(H3) $(S3) $(D3) $(C3) $(Z3) endif ifneq ($(CROSS), 1) @@ -200,6 +270,8 @@ ifneq ($(CROSS), 1) ifeq ($(BUILD_BFLOAT16),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_sbgemm > SBBLAT3.SUMM @$(GREP) -q FATAL SBBLAT3.SUMM && cat SBBLAT3.SUMM || exit 0 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_bgemm > BBLAT3.SUMM + @$(GREP) -q FATAL BBLAT3.SUMM && cat BBLAT3.SUMM || exit 0 endif ifeq ($(BUILD_SINGLE),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat3 < ./sblat3.dat @@ -223,6 +295,8 @@ ifeq ($(USE_OPENMP), 1) ifeq ($(BUILD_BFLOAT16),1) OMP_NUM_THREADS=2 ./test_sbgemm > SBBLAT3.SUMM @$(GREP) -q FATAL SBBLAT3.SUMM && cat SBBLAT3.SUMM || exit 0 + OMP_NUM_THREADS=2 ./test_bgemm > BBLAT3.SUMM + @$(GREP) -q FATAL BBLAT3.SUMM && cat BBLAT3.SUMM || exit 0 endif ifeq ($(BUILD_SINGLE),1) OMP_NUM_THREADS=2 ./sblat3 < ./sblat3.dat @@ -244,6 +318,8 @@ else ifeq ($(BUILD_BFLOAT16),1) OPENBLAS_NUM_THREADS=2 ./test_sbgemm > SBBLAT3.SUMM @$(GREP) -q FATAL SBBLAT3.SUMM && cat SBBLAT3.SUMM || exit 0 + OPENBLAS_NUM_THREADS=2 ./test_bgemm > BBLAT3.SUMM + @$(GREP) -q FATAL BBLAT3.SUMM && cat BBLAT3.SUMM || exit 0 endif ifeq ($(BUILD_SINGLE),1) OPENBLAS_NUM_THREADS=2 ./sblat3 < ./sblat3.dat @@ -367,10 +443,28 @@ zblat3 : zblat3.$(SUFFIX) ../$(LIBNAME) endif ifeq ($(BUILD_BFLOAT16),1) -test_sbgemm : compare_sgemm_sbgemm.c ../$(LIBNAME) - $(CC) $(CLDFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +test_bgemm : compare_sgemm_bgemm.c test_helpers.h ../$(LIBNAME) + $(CC) $(CLDFLAGS) -DIBFLOAT16 -DOBFLOAT16 -o test_bgemm compare_sgemm_bgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +test_bgemv : compare_sgemv_bgemv.c ../$(LIBNAME) + $(CC) $(CLDFLAGS) -DIBFLOAT16 -DOBFLOAT16 -o test_bgemv compare_sgemv_bgemv.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +test_sbgemm : compare_sgemm_sbgemm.c test_helpers.h ../$(LIBNAME) + $(CC) $(CLDFLAGS) -DIBFLOAT16 -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +test_sbgemv : compare_sgemv_sbgemv.c ../$(LIBNAME) + $(CC) $(CLDFLAGS) -DIBFLOAT16 -o test_sbgemv compare_sgemv_sbgemv.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) endif +ifeq ($(BUILD_HFLOAT16),1) +test_shgemm : compare_sgemm_shgemm.c test_helpers.h ../$(LIBNAME) + $(CC) $(CLDFLAGS) -DIHFLOAT16 -o test_shgemm compare_sgemm_shgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +test_shgemv : compare_sgemv_shgemv.c ../$(LIBNAME) + $(CC) $(CLDFLAGS) -o test_shgemv compare_sgemv_shgemv.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +endif + + ifeq ($(BUILD_COMPLEX),1) cblat3_3m : cblat3_3m.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o cblat3_3m cblat3_3m.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) @@ -387,7 +481,7 @@ clean: @rm -f *.$(SUFFIX) *.$(PSUFFIX) gmon.$(SUFFIX)ut *.SUMM *.cxml *.exe *.pdb *.dwf \ sblat1 dblat1 cblat1 zblat1 \ sblat2 dblat2 cblat2 zblat2 \ - test_sbgemm sblat3 dblat3 cblat3 zblat3 \ + test_bgemm test_bgemv test_sbgemm test_sbgemv test_shgemm test_shgemv sblat3 dblat3 cblat3 zblat3 \ sblat1p dblat1p cblat1p zblat1p \ sblat2p dblat2p cblat2p zblat2p \ sblat3p dblat3p cblat3p zblat3p \ diff --git a/test/compare_sgemm_bgemm.c b/test/compare_sgemm_bgemm.c new file mode 100644 index 0000000000..1fe4501e37 --- /dev/null +++ b/test/compare_sgemm_bgemm.c @@ -0,0 +1,166 @@ +/*************************************************************************** +Copyright (c) 2025 The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#include "../common.h" +#include +#include + +#include "test_helpers.h" + +#define SGEMM BLASFUNC(sgemm) +#define BGEMM BLASFUNC(bgemm) +#define BGEMM_LARGEST 256 + +int +main (int argc, char *argv[]) +{ + blasint m, n, k; + int i, j, l; + blasint x, y; + blasint one = 1; + int ret = 0; + int loop = BGEMM_LARGEST; + char transA = 'N', transB = 'N'; + float alpha = 1.0, beta = 1.0; + bfloat16 alpha_bf16; + sbstobf16_(&one, &alpha, &one, &alpha_bf16, &one); + bfloat16 beta_bf16; + sbstobf16_(&one, &beta, &one, &beta_bf16, &one); + + for (x = 0; x <= loop; x++) + { + if ((x > 100) && (x != BGEMM_LARGEST)) continue; + m = k = n = x; + float *A = (float *)malloc_safe(m * k * sizeof(FLOAT)); + float *B = (float *)malloc_safe(k * n * sizeof(FLOAT)); + float *C = (float *)malloc_safe(m * n * sizeof(FLOAT)); + bfloat16 *AA = (bfloat16 *)malloc_safe(m * k * sizeof(bfloat16)); + bfloat16 *BB = (bfloat16 *)malloc_safe(k * n * sizeof(bfloat16)); + bfloat16 *CC = (bfloat16 *)malloc_safe(k * n * sizeof(bfloat16)); + FLOAT *DD = (FLOAT *)malloc_safe(m * n * sizeof(FLOAT)); + if ((A == NULL) || (B == NULL) || (C == NULL) || (AA == NULL) || (BB == NULL) || + (DD == NULL) || (CC == NULL)) + return 1; + + for (j = 0; j < m; j++) + { + for (i = 0; i < k; i++) + { + A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + sbstobf16_(&one, &A[j*k+i], &one, &AA[j * k + i], &one); + } + } + for (j = 0; j < n; j++) + { + for (i = 0; i < k; i++) + { + B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + sbstobf16_(&one, &B[j*k+i], &one, &BB[j * k + i], &one); + } + } + for (y = 0; y < 4; y++) + { + if ((y == 0) || (y == 2)) { + transA = 'N'; + } else { + transA = 'T'; + } + if ((y == 0) || (y == 1)) { + transB = 'N'; + } else { + transB = 'T'; + } + + for (j = 0; j < m; j++) + { + for (i = 0; i < n; i++) + { + C[j * n + i] = 100.0; + DD[j * n + i] = 100.0; + sbstobf16_(&one, &C[j * n + i], &one, &CC[j * n + i], &one); + } + } + + SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, + &m, B, &k, &beta, C, &m); + BGEMM (&transA, &transB, &m, &n, &k, &alpha_bf16, (bfloat16*)AA, + &m, (bfloat16*)BB, &k, &beta_bf16, (bfloat16*)CC, &m); + + for (i = 0; i < n; i++) + for (j = 0; j < m; j++) + { + for (l = 0; l < k; l++) + if (transA == 'N' && transB == 'N') + { + DD[i * m + j] += + float16to32 (AA[l * m + j]) * float16to32 (BB[l + k * i]); + } else if (transA == 'T' && transB == 'N') + { + DD[i * m + j] += + float16to32 (AA[k * j + l]) * float16to32 (BB[l + k * i]); + } else if (transA == 'N' && transB == 'T') + { + DD[i * m + j] += + float16to32 (AA[l * m + j]) * float16to32 (BB[i + l * n]); + } else if (transA == 'T' && transB == 'T') + { + DD[i * m + j] += + float16to32 (AA[k * j + l]) * float16to32 (BB[i + l * n]); + } + if (!is_close(float16to32(CC[i * m + j]), truncate_float32_to_bfloat16(C[i * m + j]), 0.01, 0.001)) { +#ifdef DEBUG + printf("Mismatch at i=%d, j=%d, k=%d: CC=%.6f, C=%.6f\n", + i, j, k, float16to32(CC[i * m + j]), truncate_float32_to_bfloat16(C[i * m + j])); +#endif + ret++; + } + + if (!is_close(float16to32(CC[i * m + j]), truncate_float32_to_bfloat16(DD[i * m + j]), 0.01, 0.001)) { +#ifdef DEBUG + printf("Mismatch at i=%d, j=%d, k=%d: CC=%.6f, DD=%.6f\n", + i, j, k, float16to32(CC[i * m + j]), truncate_float32_to_bfloat16(DD[i * m + j])); +#endif + ret++; + } + + } + } + free(A); + free(B); + free(C); + free(AA); + free(BB); + free(CC); + free(DD); + } + + if (ret != 0) { + fprintf(stderr, "BGEMM FAILURES: %d\n", ret); + return 1; + } + + return ret; +} diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index ae109c1a56..e7a145f2d6 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -27,72 +27,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include "../common.h" + +#include "test_helpers.h" + #define SGEMM BLASFUNC(sgemm) #define SBGEMM BLASFUNC(sbgemm) #define SGEMV BLASFUNC(sgemv) #define SBGEMV BLASFUNC(sbgemv) -typedef union -{ - unsigned short v; -#if defined(_AIX) - struct __attribute__((packed)) -#else - struct -#endif - { -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - unsigned short s:1; - unsigned short e:8; - unsigned short m:7; -#else - unsigned short m:7; - unsigned short e:8; - unsigned short s:1; -#endif - } bits; -} bfloat16_bits; - -typedef union -{ - float v; -#if defined(_AIX) - struct __attribute__((packed)) -#else - struct -#endif - { -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - uint32_t s:1; - uint32_t e:8; - uint32_t m:23; -#else - uint32_t m:23; - uint32_t e:8; - uint32_t s:1; -#endif - } bits; -} float32_bits; - -float -float16to32 (bfloat16_bits f16) -{ - float32_bits f32; - f32.bits.s = f16.bits.s; - f32.bits.e = f16.bits.e; - f32.bits.m = (uint32_t) f16.bits.m << 16; - return f32.v; -} - #define SBGEMM_LARGEST 256 -void *malloc_safe(size_t size) -{ - if (size == 0) - return malloc(1); - else - return malloc(size); -} - int main (int argc, char *argv[]) { @@ -111,14 +54,13 @@ main (int argc, char *argv[]) float *A = (float *)malloc_safe(m * k * sizeof(FLOAT)); float *B = (float *)malloc_safe(k * n * sizeof(FLOAT)); float *C = (float *)malloc_safe(m * n * sizeof(FLOAT)); - bfloat16_bits *AA = (bfloat16_bits *)malloc_safe(m * k * sizeof(bfloat16_bits)); - bfloat16_bits *BB = (bfloat16_bits *)malloc_safe(k * n * sizeof(bfloat16_bits)); + bfloat16 *AA = (bfloat16 *)malloc_safe(m * k * sizeof(bfloat16)); + bfloat16 *BB = (bfloat16 *)malloc_safe(k * n * sizeof(bfloat16)); float *DD = (float *)malloc_safe(m * n * sizeof(FLOAT)); float *CC = (float *)malloc_safe(m * n * sizeof(FLOAT)); if ((A == NULL) || (B == NULL) || (C == NULL) || (AA == NULL) || (BB == NULL) || (DD == NULL) || (CC == NULL)) return 1; - bfloat16 atmp,btmp; blasint one=1; for (j = 0; j < m; j++) @@ -126,8 +68,7 @@ main (int argc, char *argv[]) for (i = 0; i < k; i++) { A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; - sbstobf16_(&one, &A[j*k+i], &one, &atmp, &one); - AA[j * k + i].v = atmp; + sbstobf16_(&one, &A[j*k+i], &one, &AA[j * k + i], &one); } } for (j = 0; j < n; j++) @@ -135,8 +76,7 @@ main (int argc, char *argv[]) for (i = 0; i < k; i++) { B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; - sbstobf16_(&one, &B[j*k+i], &one, &btmp, &one); - BB[j * k + i].v = btmp; + sbstobf16_(&one, &B[j*k+i], &one, &BB[j * k + i], &one); } } for (y = 0; y < 4; y++) @@ -182,10 +122,12 @@ main (int argc, char *argv[]) DD[i * m + j] += float16to32 (AA[k * j + l]) * float16to32 (BB[i + l * n]); } - if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0) + if (!is_close(CC[i * m + j], C[i * m + j], 0.01, 0.001)) { ret++; - if (fabs (CC[i * m + j] - DD[i * m + j]) > 1.0) + } + if (!is_close(CC[i * m + j], DD[i * m + j], 0.001, 0.0001)) { ret++; + } } } free(A); @@ -198,89 +140,9 @@ main (int argc, char *argv[]) } if (ret != 0) { - fprintf (stderr, "FATAL ERROR SBGEMM - Return code: %d\n", ret); - return ret; + fprintf(stderr, "SBGEMM FAILURES: %d\n", ret); + return 1; } - for (beta = 0; beta < 3; beta += 1) { - for (alpha = 0; alpha < 3; alpha += 1) { - for (l = 0; l < 2; l++) { // l = 1 to test inc_x & inc_y not equal to one. - for (x = 1; x <= loop; x++) - { - k = (x == 0) ? 0 : l + 1; - float *A = (float *)malloc_safe(x * x * sizeof(FLOAT)); - float *B = (float *)malloc_safe(x * sizeof(FLOAT) << l); - float *C = (float *)malloc_safe(x * sizeof(FLOAT) << l); - bfloat16_bits *AA = (bfloat16_bits *)malloc_safe(x * x * sizeof(bfloat16_bits)); - bfloat16_bits *BB = (bfloat16_bits *)malloc_safe(x * sizeof(bfloat16_bits) << l); - float *DD = (float *)malloc_safe(x * sizeof(FLOAT)); - float *CC = (float *)malloc_safe(x * sizeof(FLOAT) << l); - if ((A == NULL) || (B == NULL) || (C == NULL) || (AA == NULL) || (BB == NULL) || - (DD == NULL) || (CC == NULL)) - return 1; - bfloat16 atmp, btmp; - blasint one = 1; - - for (j = 0; j < x; j++) - { - for (i = 0; i < x; i++) - { - A[j * x + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; - sbstobf16_(&one, &A[j*x+i], &one, &atmp, &one); - AA[j * x + i].v = atmp; - } - B[j << l] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; - sbstobf16_(&one, &B[j << l], &one, &btmp, &one); - BB[j << l].v = btmp; - - CC[j << l] = C[j << l] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; - } - - for (y = 0; y < 2; y++) - { - if (y == 0) { - transA = 'N'; - } else { - transA = 'T'; - } - - memset(CC, 0, x * sizeof(FLOAT) << l); - memset(DD, 0, x * sizeof(FLOAT)); - memset(C, 0, x * sizeof(FLOAT) << l); - - SGEMV (&transA, &x, &x, &alpha, A, &x, B, &k, &beta, C, &k); - SBGEMV (&transA, &x, &x, &alpha, (bfloat16*) AA, &x, (bfloat16*) BB, &k, &beta, CC, &k); - - for (int i = 0; i < x; i ++) DD[i] *= beta; - - for (j = 0; j < x; j++) - for (i = 0; i < x; i++) - if (transA == 'N') { - DD[i] += alpha * float16to32 (AA[j * x + i]) * float16to32 (BB[j << l]); - } else if (transA == 'T') { - DD[j] += alpha * float16to32 (AA[j * x + i]) * float16to32 (BB[i << l]); - } - - for (j = 0; j < x; j++) { - if (fabs (CC[j << l] - C[j << l]) > 1.0) - ret++; - if (fabs (CC[j << l] - DD[j]) > 1.0) - ret++; - } - } - free(A); - free(B); - free(C); - free(AA); - free(BB); - free(DD); - free(CC); - } // x - } // l - } // alpha - } // beta - - if (ret != 0) - fprintf (stderr, "FATAL ERROR SBGEMV - Return code: %d\n", ret); return ret; } diff --git a/test/compare_sgemm_shgemm.c b/test/compare_sgemm_shgemm.c new file mode 100644 index 0000000000..7a97a06697 --- /dev/null +++ b/test/compare_sgemm_shgemm.c @@ -0,0 +1,234 @@ +/*************************************************************************** +Copyright (c) 2020,2025 The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#include +#include +#include "../common.h" + +#include "test_helpers.h" + +#define SGEMM BLASFUNC(sgemm) +#define SHGEMM BLASFUNC(shgemm) +#define SHGEMM_LARGEST 256 + +int +main (int argc, char *argv[]) +{ + blasint m, n, k; + int i, j, l; + blasint x, y; + int ret = 0; + int rret = 0; + int loop = SHGEMM_LARGEST; + char transA = 'N', transB = 'N'; + float alpha = 1.0, beta = 0.0; + int xvals[6]={3,24,55,71,SHGEMM_LARGEST/2,SHGEMM_LARGEST}; + + for (x = 0; x <= loop; x++) + { + if ((x > 100) && (x != SHGEMM_LARGEST)) continue; + m = k = n = x; + float *A = (float *)malloc_safe(m * k * sizeof(FLOAT)); + float *B = (float *)malloc_safe(k * n * sizeof(FLOAT)); + float *C = (float *)malloc_safe(m * n * sizeof(FLOAT)); + _Float16 *AA = (_Float16 *)malloc_safe(m * k * sizeof(_Float16)); + _Float16 *BB = (_Float16 *)malloc_safe(k * n * sizeof(_Float16)); + float *DD = (float *)malloc_safe(m * n * sizeof(FLOAT)); + float *CC = (float *)malloc_safe(m * n * sizeof(FLOAT)); + if ((A == NULL) || (B == NULL) || (C == NULL) || (AA == NULL) || (BB == NULL) || + (DD == NULL) || (CC == NULL)) + return 1; + + for (j = 0; j < m; j++) + { + for (i = 0; i < k; i++) + { + A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + AA[j * k + i] = (_Float16) A[j * k + i]; + } + } + for (j = 0; j < n; j++) + { + for (i = 0; i < k; i++) + { + B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + BB[j * k + i] = (_Float16) B[j * k + i]; + } + } + for (y = 0; y < 4; y++) + { + if ((y == 0) || (y == 2)) { + transA = 'N'; + } else { + transA = 'T'; + } + if ((y == 0) || (y == 1)) { + transB = 'N'; + } else { + transB = 'T'; + } + + memset(CC, 0, m * n * sizeof(FLOAT)); + memset(DD, 0, m * n * sizeof(FLOAT)); + memset(C, 0, m * n * sizeof(FLOAT)); + + SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, + &m, B, &k, &beta, C, &m); + SHGEMM (&transA, &transB, &m, &n, &k, &alpha, (_Float16*) AA, + &m, (_Float16*)BB, &k, &beta, CC, &m); + + for (i = 0; i < n; i++) + for (j = 0; j < m; j++) + { + for (l = 0; l < k; l++) + if (transA == 'N' && transB == 'N') + { + DD[i * m + j] += + (float) AA[l * m + j] * (float)BB[l + k * i]; + } else if (transA == 'T' && transB == 'N') + { + DD[i * m + j] += + (float)AA[k * j + l] * (float)BB[l + k * i]; + } else if (transA == 'N' && transB == 'T') + { + DD[i * m + j] += + (float)AA[l * m + j] * (float)BB[i + l * n]; + } else if (transA == 'T' && transB == 'T') + { + DD[i * m + j] += + (float)AA[k * j + l] * (float)BB[i + l * n]; + } + if (!is_close(CC[i * m + j], C[i * m + j], 0.01, 0.001)) { + fprintf(stderr,"CC %f C %f \n",(float)CC[i*m+j],C[i*m+j]); + ret++; + } + if (!is_close(CC[i * m + j], DD[i * m + j], 0.001, 0.0001)) { + fprintf(stderr,"CC %f DD %f \n",(float)CC[i*m+j],(float)DD[i*m+j]); + ret++; + } + } + } + free(A); + free(B); + free(C); + free(AA); + free(BB); + free(DD); + free(CC); + } + if (ret != 0) { + fprintf(stderr, "SHGEMM FAILURES: %d!!!\n", ret); + return 1; + } + + + for (loop = 0; loop<6; loop++) { + x=xvals[loop]; + for (alpha=0.;alpha<=1.;alpha+=0.5) + { + for (beta = 0.0; beta <=1.; beta+=0.5) { + + m = k = n = x; + float *A = (float *)malloc_safe(m * k * sizeof(FLOAT)); + float *B = (float *)malloc_safe(k * n * sizeof(FLOAT)); + float *C = (float *)malloc_safe(m * n * sizeof(FLOAT)); + _Float16 *AA = (_Float16 *)malloc_safe(m * k * sizeof(_Float16)); + _Float16 *BB = (_Float16 *)malloc_safe(k * n * sizeof(_Float16)); + float *CC = (float *)malloc_safe(m * n * sizeof(FLOAT)); + if ((A == NULL) || (B == NULL) || (C == NULL) || (AA == NULL) || (BB == NULL) || + (CC == NULL)) + return 1; + + for (j = 0; j < m; j++) + { + for (i = 0; i < k; i++) + { + A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + AA[j * k + i] = (_Float16) A[j * k + i]; + } + } + for (j = 0; j < n; j++) + { + for (i = 0; i < k; i++) + { + B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + BB[j * k + i] = (_Float16) B[j * k + i]; + } + } + + for (y = 0; y < 4; y++) + { + if ((y == 0) || (y == 2)) { + transA = 'N'; + } else { + transA = 'T'; + } + if ((y == 0) || (y == 1)) { + transB = 'N'; + } else { + transB = 'T'; + } + + memset(CC, 0, m * n * sizeof(FLOAT)); + memset(C, 0, m * n * sizeof(FLOAT)); + + SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, + &m, B, &k, &beta, C, &m); + SHGEMM (&transA, &transB, &m, &n, &k, &alpha, (_Float16*) AA, + &m, (_Float16*)BB, &k, &beta, CC, &m); + + for (i = 0; i < n; i++) + for (j = 0; j < m; j++) + { + if (!is_close(CC[i * m + j], C[i * m + j], 0.01, 0.001)) { + ret++; + } + } + } + free(A); + free(B); + free(C); + free(AA); + free(BB); + free(CC); + + if (ret != 0) { +/* + * fprintf(stderr, "SHGEMM FAILURES FOR n=%d, alpha=%f beta=%f : %d\n", x, alpha, beta, ret); + */ + rret++; + ret=0; +/* } else { + fprintf(stderr, "SHGEMM SUCCEEDED FOR n=%d, alpha=%f beta=%f : %d\n", x, alpha, beta, ret); +*/ + } + } + + } + } + if (rret > 0) return(1); + return(0); +} diff --git a/test/compare_sgemv_bgemv.c b/test/compare_sgemv_bgemv.c new file mode 100644 index 0000000000..d9dc30d9a6 --- /dev/null +++ b/test/compare_sgemv_bgemv.c @@ -0,0 +1,156 @@ +/*************************************************************************** +Copyright (c) 2020,2025 The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#include +#include +#include "../common.h" + +#include "test_helpers.h" + +#define SGEMV BLASFUNC(sgemv) +#define BGEMV BLASFUNC(bgemv) +#define BGEMV_LARGEST 256 + +int main(int argc, char *argv[]) +{ + blasint k; + int i, j, l; + blasint x, y; + blasint one = 1; + int ret = 0; + int loop = BGEMV_LARGEST; + char transA = 'N'; + float alpha = 1.0, beta = 0.0; + bfloat16 alpha_bf16, beta_bf16; + + for (beta = 0; beta < 3; beta += 1) + { + for (alpha = 0; alpha < 3; alpha += 1) + { + for (l = 0; l < 2; l++) + { // l = 1 to test inc_x & inc_y not equal to one. + for (x = 1; x <= loop; x++) + { + k = (x == 0) ? 0 : l + 1; + float *A = (float *)malloc_safe(x * x * sizeof(FLOAT)); + float *B = (float *)malloc_safe(x * sizeof(FLOAT) << l); + float *C = (float *)malloc_safe(x * sizeof(FLOAT) << l); + bfloat16 *AA = (bfloat16 *)malloc_safe(x * x * sizeof(bfloat16)); + bfloat16 *BB = (bfloat16 *)malloc_safe(x * sizeof(bfloat16) << l); + bfloat16 *CC = (bfloat16 *)malloc_safe(x * sizeof(bfloat16) << l); + float *DD = (float *)malloc_safe(x * sizeof(FLOAT)); + if ((A == NULL) || (B == NULL) || (C == NULL) || (AA == NULL) || (BB == NULL) || + (CC == NULL) || (DD == NULL)) + return 1; + + for (j = 0; j < x; j++) + { + for (i = 0; i < x; i++) + { + A[j * x + i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) + 0.5; + sbstobf16_(&one, &A[j * x + i], &one, &AA[j * x + i], &one); + } + B[j << l] = ((FLOAT)rand() / (FLOAT)RAND_MAX) + 0.5; + sbstobf16_(&one, &B[j << l], &one, &BB[j << l], &one); + + C[j << l] = ((FLOAT)rand() / (FLOAT)RAND_MAX) + 0.5; + sbstobf16_(&one, &B[j << l], &one, &CC[j << l], &one); + } + + for (y = 0; y < 2; y++) + { + if (y == 0) + { + transA = 'N'; + } + else + { + transA = 'T'; + } + + memset(C, 0, x * sizeof(FLOAT) << l); + memset(CC, 0, x * sizeof(bfloat16) << l); + memset(DD, 0, x * sizeof(FLOAT)); + + sbstobf16_(&one, &alpha, &one, &alpha_bf16, &one); + sbstobf16_(&one, &beta, &one, &beta_bf16, &one); + SGEMV(&transA, &x, &x, &alpha, A, &x, B, &k, &beta, C, &k); + BGEMV(&transA, &x, &x, &alpha_bf16, AA, &x, BB, &k, &beta_bf16, CC, &k); + + for (i = 0; i < x; i++) + DD[i] *= beta; + + for (j = 0; j < x; j++) + for (i = 0; i < x; i++) + if (transA == 'N') + { + DD[i] += alpha * float16to32(AA[j * x + i]) * float16to32(BB[j << l]); + } + else if (transA == 'T') + { + DD[j] += alpha * float16to32(AA[j * x + i]) * float16to32(BB[i << l]); + } + + for (j = 0; j < x; j++) + { + if (!is_close(float16to32(CC[j << l]), truncate_float32_to_bfloat16(C[j << l]), 0.01, 0.001)) + { +#ifdef DEBUG + printf("Mismatch at trans=%c, alpha=%.2f, beta=%.2f, i=%d, j=%d, k=%d: CC=%.6f, C=%.6f\n", + transA, alpha, beta, i, j, k, float16to32(CC[j << l]), truncate_float32_to_bfloat16(C[j << l])); +#endif + ret++; + } + if (!is_close(float16to32(CC[j << l]), truncate_float32_to_bfloat16(DD[j]), 0.001, 0.0001)) + { +#ifdef DEBUG + printf("Mismatch at trans=%c, alpha=%.2f, beta=%.2f, i=%d, j=%d, k=%d: CC=%.6f, DD=%.6f\n", + transA, alpha, beta, i, j, k, float16to32(CC[j << l]), truncate_float32_to_bfloat16(DD[j])); +#endif + ret++; + } + } + } + + free(A); + free(B); + free(C); + free(AA); + free(BB); + free(CC); + free(DD); + } // x + } // l + } // alpha + } // beta + + if (ret != 0) { + fprintf(stderr, "BGEMV FAILURES: %d\n", ret); + return 1; + } + + return ret; +} diff --git a/test/compare_sgemv_sbgemv.c b/test/compare_sgemv_sbgemv.c new file mode 100644 index 0000000000..627cf7146c --- /dev/null +++ b/test/compare_sgemv_sbgemv.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2020,2025 The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#include +#include +#include "../common.h" + +#include "test_helpers.h" + +#define SGEMV BLASFUNC(sgemv) +#define SBGEMV BLASFUNC(sbgemv) +#define SBGEMV_LARGEST 256 + +int +main (int argc, char *argv[]) +{ + blasint k; + int i, j, l; + blasint x, y; + int ret = 0; + int loop = SBGEMV_LARGEST; + char transA = 'N'; + float alpha = 1.0, beta = 0.0; + + for (beta = 0; beta < 3; beta += 1) { + for (alpha = 0; alpha < 3; alpha += 1) { + for (l = 0; l < 2; l++) { // l = 1 to test inc_x & inc_y not equal to one. + for (x = 1; x <= loop; x++) + { + k = (x == 0) ? 0 : l + 1; + float *A = (float *)malloc_safe(x * x * sizeof(FLOAT)); + float *B = (float *)malloc_safe(x * sizeof(FLOAT) << l); + float *C = (float *)malloc_safe(x * sizeof(FLOAT) << l); + bfloat16 *AA = (bfloat16 *)malloc_safe(x * x * sizeof(bfloat16)); + bfloat16 *BB = (bfloat16 *)malloc_safe(x * sizeof(bfloat16) << l); + float *CC = (float *)malloc_safe(x * sizeof(FLOAT) << l); + float *DD = (float *)malloc_safe(x * sizeof(FLOAT)); + if ((A == NULL) || (B == NULL) || (C == NULL) || (AA == NULL) || (BB == NULL) || + (DD == NULL) || (CC == NULL)) + return 1; + blasint one = 1; + + for (j = 0; j < x; j++) + { + for (i = 0; i < x; i++) + { + A[j * x + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + sbstobf16_(&one, &A[j*x+i], &one, &AA[j * x + i], &one); + } + B[j << l] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + sbstobf16_(&one, &B[j << l], &one, &BB[j << l], &one); + + CC[j << l] = C[j << l] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + } + + for (y = 0; y < 2; y++) + { + if (y == 0) { + transA = 'N'; + } else { + transA = 'T'; + } + + memset(CC, 0, x * sizeof(FLOAT) << l); + memset(DD, 0, x * sizeof(FLOAT)); + memset(C, 0, x * sizeof(FLOAT) << l); + + SGEMV (&transA, &x, &x, &alpha, A, &x, B, &k, &beta, C, &k); + SBGEMV (&transA, &x, &x, &alpha, (bfloat16*) AA, &x, (bfloat16*) BB, &k, &beta, CC, &k); + + for (int i = 0; i < x; i ++) DD[i] *= beta; + + for (j = 0; j < x; j++) + for (i = 0; i < x; i++) + if (transA == 'N') { + DD[i] += alpha * float16to32 (AA[j * x + i]) * float16to32 (BB[j << l]); + } else if (transA == 'T') { + DD[j] += alpha * float16to32 (AA[j * x + i]) * float16to32 (BB[i << l]); + } + + for (j = 0; j < x; j++) { + if (!is_close(CC[j << l], C[j << l], 0.01, 0.001)) { + ret++; + } + if (!is_close(CC[j << l], DD[j], 0.001, 0.0001)) { + ret++; + } + } + } + free(A); + free(B); + free(C); + free(AA); + free(BB); + free(DD); + free(CC); + } // x + } // l + } // alpha + } // beta + + if (ret != 0) { + fprintf(stderr, "SBGEMV FAILURES: %d\n", ret); + return 1; + } + + return ret; +} diff --git a/test/compare_sgemv_shgemv.c b/test/compare_sgemv_shgemv.c new file mode 100644 index 0000000000..9e92218acb --- /dev/null +++ b/test/compare_sgemv_shgemv.c @@ -0,0 +1,130 @@ +/*************************************************************************** +Copyright (c) 2020,2025 The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#include +#include +#include "../common.h" + +#include "test_helpers.h" + +#define SGEMV BLASFUNC(sgemv) +#define SHGEMV BLASFUNC(shgemv) +#define SHGEMV_LARGEST 256 + +int +main (int argc, char *argv[]) +{ + blasint k; + int i, j, l; + blasint x, y; + int ret = 0; + int loop = SHGEMV_LARGEST; + char transA = 'N'; + float alpha = 1.0, beta = 0.0; + + for (beta = 0; beta < 3; beta += 1) { + for (alpha = 0; alpha < 3; alpha += 1) { + for (l = 0; l < 2; l++) { // l = 1 to test inc_x & inc_y not equal to one. + for (x = 1; x <= loop; x++) + { + k = (x == 0) ? 0 : l + 1; + float *A = (float *)malloc_safe(x * x * sizeof(FLOAT)); + float *B = (float *)malloc_safe(x * sizeof(FLOAT) << l); + float *C = (float *)malloc_safe(x * sizeof(FLOAT) << l); + hfloat16 *AA = (hfloat16 *)malloc_safe(x * x * sizeof(hfloat16)); + hfloat16 *BB = (hfloat16 *)malloc_safe(x * sizeof(hfloat16) << l); + float *CC = (float *)malloc_safe(x * sizeof(FLOAT) << l); + float *DD = (float *)malloc_safe(x * sizeof(FLOAT)); + if ((A == NULL) || (B == NULL) || (C == NULL) || (AA == NULL) || (BB == NULL) || + (DD == NULL) || (CC == NULL)) + return 1; + + for (j = 0; j < x; j++) + { + for (i = 0; i < x; i++) + { + A[j * x + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + AA[j * x + i] = (_Float16)A[j * x + i]; + } + B[j << l] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + BB[j << l]= (_Float16)B[j << l]; + + CC[j << l] = C[j << l] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + } + + for (y = 0; y < 2; y++) + { + if (y == 0) { + transA = 'N'; + } else { + transA = 'T'; + } + + memset(CC, 0, x * sizeof(FLOAT) << l); + memset(DD, 0, x * sizeof(FLOAT)); + memset(C, 0, x * sizeof(FLOAT) << l); + + SGEMV (&transA, &x, &x, &alpha, A, &x, B, &k, &beta, C, &k); + SHGEMV (&transA, &x, &x, &alpha, (hfloat16*) AA, &x, (hfloat16*) BB, &k, &beta, CC, &k); + + for (int i = 0; i < x; i ++) DD[i] *= beta; + + for (j = 0; j < x; j++) + for (i = 0; i < x; i++) + if (transA == 'N') { + DD[i] += alpha * (float)(AA[j * x + i]) * (float)(BB[j << l]); + } else if (transA == 'T') { + DD[j] += alpha * (float)(AA[j * x + i]) * (float)(BB[i << l]); + } + + for (j = 0; j < x; j++) { + if (!is_close(CC[j << l], C[j << l], 0.01, 0.001)) { + ret++; + } + if (!is_close(CC[j << l], DD[j], 0.001, 0.0001)) { + ret++; + } + } + } + free(A); + free(B); + free(C); + free(AA); + free(BB); + free(DD); + free(CC); + } // x + } // l + } // alpha + } // beta + + if (ret != 0) { + fprintf (stderr, "SHGEMV FAILURES: %d\n", ret); + return 1; + } + + return ret; +} diff --git a/test/test_helpers.h b/test/test_helpers.h new file mode 100644 index 0000000000..fcec86e102 --- /dev/null +++ b/test/test_helpers.h @@ -0,0 +1,66 @@ +/*************************************************************************** +Copyright (c) 2025 The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#ifndef TEST_HELPERS_H +#define TEST_HELPERS_H +#include + +#include "../common.h" + +#ifdef IBFLOAT16 +static float float16to32(bfloat16 value) +{ + blasint one = 1; + float result; + sbf16tos_(&one, &value, &one, &result, &one); + return result; +} +#endif + +#ifdef OBFLOAT16 +static float truncate_float32_to_bfloat16(float value) { + blasint one = 1; + bfloat16 tmp; + float result; + sbstobf16_(&one, &value, &one, &tmp, &one); + sbf16tos_(&one, &tmp, &one, &result, &one); + return result; +} +#endif + +static void *malloc_safe(size_t size) { + if (size == 0) + return malloc(1); + else + return malloc(size); +} + +static bool is_close(float a, float b, float rtol, float atol) { + return fabs(a - b) <= (atol + rtol*fabs(b)); +} + +#endif diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 6a61899da5..c73152d79a 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -101,6 +101,7 @@ if (NOT USE_OPENMP) set(OpenBLAS_utest_src ${OpenBLAS_utest_src} test_fork.c + test_post_fork_async.c ) endif() set(OpenBLAS_utest_src diff --git a/utest/Makefile b/utest/Makefile index b82937093f..e92fb67dc1 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -45,7 +45,7 @@ endif # FIXME TBD if this works on OSX, SunOS, POWER and zarch ifeq ($(OSNAME), $(filter $(OSNAME),Linux CYGWIN_NT)) ifneq ($(USE_OPENMP), 1) -OBJS += test_fork.o +OBJS += test_fork.o test_post_fork_async.o endif OBJS += test_post_fork.o endif diff --git a/utest/test_post_fork_async.c b/utest/test_post_fork_async.c new file mode 100644 index 0000000000..c283a6455f --- /dev/null +++ b/utest/test_post_fork_async.c @@ -0,0 +1,94 @@ +/***************************************************************************** +Copyright (c) 2011-2025, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include +#include +#include +#include +#include "openblas_utest.h" + +static void* xmalloc(size_t n) +{ + void* tmp; + tmp = calloc(1, n); + if (tmp == NULL) { + fprintf(stderr, "Failed to allocate memory for the testcase.\n"); + exit(1); + } else { + return tmp; + } +} + +CTEST(fork, safety_after_fork_async) +{ +#ifdef __UCLIBC__ +#if !defined __UCLIBC_HAS_STUBS__ && !defined __ARCH_USE_MMU__ +exit(0); +#endif +#endif +#ifndef BUILD_DOUBLE +exit(0); +#else + blasint n = 200; + blasint info; + blasint i; + + blasint *ipiv; + double *arr; + + pid_t fork_pid; + + arr = xmalloc(sizeof(*arr) * n * n); + ipiv = xmalloc(sizeof(*ipiv) * n); + + // array is an identity matrix + for (i = 0; i < n*n; i += n + 1) { + arr[i] = 1.0; + } + + fork_pid = fork(); + if (fork_pid == -1) { + perror("fork"); + CTEST_ERR("Failed to fork process."); + } else if (fork_pid == 0) { + exit(0); + } else { + // Wait for the child to finish and check the exit code. + int child_status = 0; + pid_t wait_pid = wait(&child_status); + ASSERT_EQUAL(wait_pid, fork_pid); + ASSERT_EQUAL(0, WEXITSTATUS (child_status)); + } + BLASFUNC(dgetrf)(&n, &n, arr, &n, ipiv, &info); +#endif +}