Add SIMD audio processing interface and implementations

- Created a new SIMD interface header and source files for audio processing functions.
- Implemented functions for filling buffers, mixing audio, applying gain, calculating RMS and peak values, normalizing audio, converting stereo to mono, limiting audio, fading audio, and a simple equalizer.
- Added SSE-specific implementations for the audio processing functions to leverage SIMD for performance improvements.
- Updated CMakeLists.txt files to include new libraries and link dependencies for the SIMD interface and SSE implementations.
- Introduced a static test helper library for unit testing with the Google Test framework.
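
For context, a minimal sketch of how the new interface is meant to be consumed through the `simd::` wrappers added in `src/simd/misc/simd_api.h` (buffer setup here is illustrative; the SSE/AVX backends assert alignment, so real callers should allocate through the project's aligned allocator):

```cpp
#include <cstdio>
#include <vector>
#include "simd_api.h"

int main() {
    // Plain vectors for illustration only; production code should use aligned buffers.
    std::vector<float> a(1024), b(1024), mixed(1024);
    simd::fill_buffer(a.data(), 0.5f, a.size());
    simd::fill_buffer(b.data(), 0.25f, b.size());
    simd::mix_audio(a.data(), b.data(), mixed.data(), mixed.size());   // mixed = a + b
    simd::apply_gain(mixed.data(), mixed.data(), 0.8f, mixed.size());  // in-place gain
    std::printf("rms=%f peak=%f via %s\n",
                simd::calculate_rms(mixed.data(), mixed.size()),
                simd::calculate_peak(mixed.data(), mixed.size()),
                simd::get_active_simd_version_string().c_str());
    return 0;
}
```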
2025-11-14 23:27:55 +08:00
parent a96b6ce25c
commit 886b6843e6
53 changed files with 4326 additions and 7305 deletions

.gitignore

@@ -5,3 +5,5 @@
.DS_Store
/build/
/.vs
/out
/logs


@@ -15,7 +15,6 @@ include(cmake/compiler_options.cmake)
include(cmake/mingw_dll.cmake)
configure_project_defaults()
configure_simd_optimizations()
setup_project_options(
STANDARD 20
INTERFACE_TARGET audio_backend_project_options


@@ -15,6 +15,19 @@
"CMAKE_CXX_COMPILER": "cl.exe",
"CMAKE_TOOLCHAIN_FILE": "$env{VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake"
}
},
{
"name": "clang",
"displayName": "Clang 20.1.8 x86_64-pc-linux-gnu",
"description": "正在使用编译器: C = /usr/bin/clang, CXX = /usr/bin/clang++",
"binaryDir": "${sourceDir}/out/build/${presetName}",
"cacheVariables": {
"CMAKE_INSTALL_PREFIX": "${sourceDir}/out/install/${presetName}",
"CMAKE_C_COMPILER": "/usr/bin/clang",
"CMAKE_CXX_COMPILER": "/usr/bin/clang++",
"CMAKE_BUILD_TYPE": "Debug",
"CMAKE_TOOLCHAIN_FILE": "$env{VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake"
}
}
],
"buildPresets": [

CMakeUserPresets.json

@@ -0,0 +1,12 @@
{
"version": 3,
"configurePresets": [
{
"name": "clang-local",
"inherits": "clang",
"environment": {
"VCPKG_ROOT": "/home/vcpkg"
}
}
]
}


@@ -74,83 +74,19 @@ function(configure_compiler_options)
message(STATUS "启用彩色诊断和完整模板回溯")
endif()
endif()
endfunction()
# ================================================================================================
# 配置SIMD优化扩展AVX512支持
# ================================================================================================
function(configure_simd_optimizations)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64")
# 检测编译器支持
include(CheckCXXCompilerFlag)
# AVX2支持保持现有
if(MSVC)
check_cxx_compiler_flag("/arch:AVX2" COMPILER_SUPPORTS_AVX2)
if(COMPILER_SUPPORTS_AVX2)
add_compile_options(/arch:AVX2)
add_compile_definitions(DAW_ENABLE_AVX2)
message(STATUS "SIMD优化: 启用AVX2指令集")
endif()
else()
check_cxx_compiler_flag("-mavx2" COMPILER_SUPPORTS_AVX2)
if(COMPILER_SUPPORTS_AVX2)
add_compile_options(-mavx2 -mfma)
add_compile_definitions(DAW_ENABLE_AVX2)
message(STATUS "SIMD优化: 启用AVX2指令集")
endif()
endif()
# AVX512支持新增
if(MSVC)
check_cxx_compiler_flag("/arch:AVX512" COMPILER_SUPPORTS_AVX512)
if(COMPILER_SUPPORTS_AVX512)
add_compile_options(/arch:AVX512)
add_compile_definitions(DAW_ENABLE_AVX512)
message(STATUS "SIMD优化: 启用AVX512指令集")
else()
message(WARNING "编译器不支持AVX512降级到AVX2")
endif()
else()
check_cxx_compiler_flag("-mavx512f" COMPILER_SUPPORTS_AVX512F)
check_cxx_compiler_flag("-mavx512vl" COMPILER_SUPPORTS_AVX512VL)
check_cxx_compiler_flag("-mavx512bw" COMPILER_SUPPORTS_AVX512BW)
if(COMPILER_SUPPORTS_AVX512F AND COMPILER_SUPPORTS_AVX512VL)
add_compile_options(-mavx512f -mavx512vl)
add_compile_definitions(DAW_ENABLE_AVX512)
if(COMPILER_SUPPORTS_AVX512BW)
add_compile_options(-mavx512bw)
add_compile_definitions(DAW_ENABLE_AVX512BW)
endif()
message(STATUS "SIMD优化: 启用AVX512指令集")
else()
message(WARNING "编译器不支持完整AVX512降级到AVX2")
endif()
endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64|ARM64")
# ARM平台检测NEON支持
include(CheckCXXCompilerFlag)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|ARM64")
# AArch64: NEON默认可用
add_compile_definitions(DAW_ENABLE_NEON)
message(STATUS "SIMD优化: 启用ARM64 NEON指令集")
else()
# ARM32: 检测NEON支持
check_cxx_compiler_flag("-mfpu=neon" COMPILER_SUPPORTS_NEON)
if(COMPILER_SUPPORTS_NEON)
add_compile_options(-mfpu=neon)
add_compile_definitions(DAW_ENABLE_NEON)
message(STATUS "SIMD优化: 启用ARM32 NEON指令集")
else()
message(STATUS "SIMD优化: ARM32平台不支持NEON")
endif()
endif()
else()
message(STATUS "SIMD优化: 当前架构(${CMAKE_SYSTEM_PROCESSOR})不支持SIMD优化")
if (MSVC)
add_compile_definitions(ALICHO_MSVC=1)
add_compile_definitions(ALICHO_GCC=0)
add_compile_definitions(ALICHO_CLANG=0)
elseif (CMAKE_CXX_COMPILER_ID MATCHES "GNU")
add_compile_definitions(ALICHO_MSVC=0)
add_compile_definitions(ALICHO_GCC=1)
add_compile_definitions(ALICHO_CLANG=0)
elseif (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
add_compile_definitions(ALICHO_MSVC=0)
add_compile_definitions(ALICHO_GCC=0)
add_compile_definitions(ALICHO_CLANG=1)
endif()
endfunction()
@@ -159,6 +95,5 @@ endfunction()
# ================================================================================================
function(apply_compiler_configuration)
configure_compiler_options()
configure_simd_optimizations()
message(STATUS "编译器配置完成")
endfunction()


@@ -142,7 +142,12 @@ function(add_os_definitions target)
# --- 阶段 3: 应用所有定义 ---
# **关键:使用一次调用将所有定义添加到目标**
if(definitions_list) # 确保列表非空
target_compile_definitions(${target} PUBLIC ${definitions_list})
get_target_property(target_type ${target} TYPE)
if(target_type STREQUAL "INTERFACE_LIBRARY")
target_compile_definitions(${target} INTERFACE ${definitions_list})
else()
target_compile_definitions(${target} PUBLIC ${definitions_list})
endif()
endif()
# 函数作用域结束时alicho_def_* 变量会自动销毁,无需显式 unset


@@ -567,8 +567,13 @@ function(simple_library library_type)
set(source_files "")
retrieve_files(${CMAKE_CURRENT_SOURCE_DIR} source_files)
add_library(${PROJECT_NAME} ${library_type} ${source_files})
target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries(${PROJECT_NAME} PUBLIC audio_backend_project_options)
if(library_type STREQUAL "INTERFACE")
target_include_directories(${PROJECT_NAME} INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries(${PROJECT_NAME} INTERFACE audio_backend_project_options)
else()
target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries(${PROJECT_NAME} PUBLIC audio_backend_project_options)
endif()
message(STATUS "创建库目标: ${PROJECT_NAME},类型: ${library_type},引用路径: ${CMAKE_CURRENT_SOURCE_DIR}")
add_os_definitions(${PROJECT_NAME})
endfunction()

File diff suppressed because it is too large.


@@ -40,6 +40,7 @@
#include <string>
#include <unordered_map>
#include <vector>
#include <atomic>
// 前向声明
class audio_processing_task;


@@ -4,6 +4,7 @@
#include "lazy_singleton.h"
#include <unordered_map>
#include <vector>
#include "plugin_type.h"


@@ -2,6 +2,7 @@
#include <cstdint>
#include <span>
#include <atomic>
#include <vector>
#include "aligned_allocator.h"
#include "transport/audio_processing_shm.h"

src/misc/lib_handle.h

@@ -0,0 +1,39 @@
#pragma once
#include <filesystem>
#include <functional>
class lib_handle {
public:
lib_handle() = default;
~lib_handle() { close(); }
auto open(const std::filesystem::path& lib_path) -> bool;
void close();
// 通过函数名称和函数签名获取函数指针
template<typename Func>
auto get_function_by_name(const std::string& func_name) -> std::function<Func> {
auto raw_func_ptr = get_raw_function(func_name);
if (!raw_func_ptr) {
return nullptr;
}
// 将原始函数指针转换为函数指针类型
// Func 是函数签名,如 int(float, double)
// Func* 是对应的函数指针类型,如 int(*)(float, double)
using func_ptr_type = Func*;
auto typed_func_ptr = reinterpret_cast<func_ptr_type>(raw_func_ptr);
return std::function<Func>(typed_func_ptr);
}
private:
auto get_raw_function(const std::string& func_name) -> void*;
void* handle_{nullptr};
};
// 通过函数签名自动推导类型并获取函数
// 用法示例auto func = get_function_by_func_signature(handle, my_function);
// 其中 my_function 是实际的函数名称
#define get_function_by_func_signature(lib_handle_instance, func_signature) \
(lib_handle_instance).get_function_by_name<std::remove_pointer_t<decltype(&func_signature)>>(#func_signature)
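
A hypothetical usage sketch for `lib_handle` (the library path and the exported symbol below are examples, not part of this commit):

```cpp
#include <cstdio>
#include <type_traits>
#include "lib_handle.h"

// Declaration of a symbol we expect some shared library to export with C linkage
// (example only; any extern "C" function works the same way).
extern "C" int add_numbers(int a, int b);

int main() {
    lib_handle lib;
    if (!lib.open("./libexample.so")) {
        return 1;  // load failed (missing file, wrong architecture, ...)
    }
    // Spell out the signature explicitly ...
    auto add = lib.get_function_by_name<int(int, int)>("add_numbers");
    // ... or let the macro deduce it from the declaration above.
    auto add_deduced = get_function_by_func_signature(lib, add_numbers);
    if (add && add_deduced) {
        std::printf("%d\n", add(2, 3));
    }
    return 0;
}
```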


@@ -0,0 +1,24 @@
#include "lib_handle.h"
#include <dlfcn.h>
auto lib_handle::open(const std::filesystem::path& lib_path) -> bool {
close();
handle_ = dlopen(lib_path.c_str(), RTLD_LAZY);
return handle_ != nullptr;
}
void lib_handle::close() {
if (handle_) {
dlclose(handle_);
handle_ = nullptr;
}
}
auto lib_handle::get_raw_function(const std::string& func_name) -> void* {
if (!handle_) {
return nullptr;
}
return dlsym(handle_, func_name.c_str());
}


@@ -16,41 +16,14 @@
#include "thread_tool.h"
#include <pthread.h>
#include <sched.h>
#include <cstring>
#include <cerrno>
#include "logger.h"
/**
* @brief 设置线程CPU亲和性Linux占位实现
*
* 当前返回false表示功能未实现。
*
* ## 计划实现
*
* 完整实现应该使用pthread_setaffinity_np或sched_setaffinity
*
* @code
* #include <pthread.h>
* #include <sched.h>
*
* bool thread_set_affinity(boost::thread& thread, int core_id) {
* cpu_set_t cpuset;
* CPU_ZERO(&cpuset); // 清空CPU集合
* CPU_SET(core_id, &cpuset); // 设置指定的CPU核心
*
* // 使用pthread API设置亲和性
* int result = pthread_setaffinity_np(
* thread.native_handle(), // pthread线程句柄
* sizeof(cpu_set_t), // CPU集合大小
* &cpuset // CPU集合指针
* );
*
* if (result != 0) {
* log_module_error(THREAD_TOOL_LOG_MODULE,
* "无法设置线程亲和性到核心{}: {}",
* core_id, strerror(result));
* return false;
* }
* return true;
* }
* @endcode
*
* ### API说明
* - cpu_set_tCPU集合类型表示一组CPU核心
* - CPU_ZERO清空CPU集合的所有位
@@ -70,47 +43,30 @@
* @todo 实现实际的线程亲和性设置功能
*/
bool thread_set_affinity(boost::thread& thread, int core_id) {
// Linux implementation can be added here
// TODO: 使用pthread_setaffinity_np或sched_setaffinity实现
return false; // Placeholder - 占位实现,表示功能未实现
// 创建CPU集合
cpu_set_t cpuset;
CPU_ZERO(&cpuset); // 清空CPU集合
CPU_SET(core_id, &cpuset); // 设置指定的CPU核心
// 使用pthread API设置亲和性
const int result = pthread_setaffinity_np(
thread.native_handle(), // pthread线程句柄
sizeof(cpu_set_t), // CPU集合大小
&cpuset // CPU集合指针
);
if (result != 0) {
log_module_error(THREAD_TOOL_LOG_MODULE,
"无法将线程亲和性设置为核心{}: {}",
core_id, strerror(result));
return false;
}
return true;
}
/**
* @brief 设置线程名称Linux占位实现
*
* 当前返回false表示功能未实现。
*
* ## 计划实现
*
* 完整实现应该使用pthread_setname_np
*
* @code
* #include <pthread.h>
* #include <cstring>
*
* bool thread_set_name(boost::thread& thread, const char* name) {
* // Linux限制线程名称最长为15个字符不含null终止符
* // 因此需要截断过长的名称
* char truncated_name[16]; // 15字符 + null终止符
* strncpy(truncated_name, name, 15);
* truncated_name[15] = '\0';
*
* // 使用pthread API设置线程名称
* int result = pthread_setname_np(
* thread.native_handle(), // pthread线程句柄
* truncated_name // 线程名称最长15字符
* );
*
* if (result != 0) {
* log_module_error(THREAD_TOOL_LOG_MODULE,
* "无法设置线程名称为 {}: {}",
* name, strerror(result));
* return false;
* }
* return true;
* }
* @endcode
*
* ### API说明
* - pthread_setname_npLinux特定的线程命名API
* - 线程名称限制为15个字符不包括null终止符
@@ -138,8 +94,24 @@ bool thread_set_affinity(boost::thread& thread, int core_id) {
* @todo 添加名称长度检查和截断逻辑
*/
bool thread_set_name(boost::thread& thread, const char* name) {
// Linux implementation can be added here
// TODO: 使用pthread_setname_np实现
// 注意Linux限制线程名称最长为15个字
return false; // Placeholder - 占位实现,表示功能未实现
// Linux限制线程名称最长为15个字符(不含null终止符)
// 因此需要截断过长的名称
char truncated_name[16]; // 15字符 + null终止符
strncpy(truncated_name, name, 15);
truncated_name[15] = '\0';
// 使用pthread API设置线程名称
const int result = pthread_setname_np(
thread.native_handle(), // pthread线程句柄
truncated_name // 线程名称最长15字符
);
if (result != 0) {
log_module_error(THREAD_TOOL_LOG_MODULE,
"无法设置线程名称为 {}: {}",
name, strerror(result));
return false;
}
return true;
}
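
An illustrative caller of the two helpers implemented above (thread body, core index, and name are placeholders):

```cpp
#include <boost/thread.hpp>
#include "thread_tool.h"

void start_audio_worker() {
    boost::thread worker([] {
        // audio processing loop ...
    });
    thread_set_affinity(worker, 2);          // pin to core 2
    thread_set_name(worker, "audio_worker"); // names over 15 chars are truncated (Linux limit)
    worker.detach();
}
```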


@@ -0,0 +1,203 @@
/**
* @file lib_handle.cpp
* @brief macOS平台动态库加载工具实现
*
* 实现了lib_handle.h中声明的跨平台动态库加载工具的macOS版本。
* 使用POSIX标准的dlopen/dlclose/dlsym API来实现动态库的加载、卸载和函数查找功能。
* 这些API在macOS上通过dyld动态链接器实现。
*
* ## macOS动态库说明
* - dylibmacOS标准动态库格式类似Linux的.so
* - frameworkmacOS特有的打包格式包含库、头文件和资源
* - bundle可加载的插件格式.bundle或.plugin
*
* ## POSIX API说明
* - dlopen加载动态库
* - dlclose卸载动态库
* - dlsym从动态库中获取符号地址
* - dlerror获取最后一次错误信息
*
* @note 使用.cpp扩展名不是.mm因为不需要Objective-C功能
* @note 与Linux实现基本相同但加载路径和搜索规则有所不同
*/
#include "lib_handle.h"
#include <dlfcn.h>
/**
* @brief 打开动态库macOS实现
*
* 使用POSIX标准的dlopen加载动态库.dylib、.framework或.bundle
*
* ## 实现细节
*
* ### macOS动态库类型
* 1. **dylib** - 标准动态库
* - 扩展名:.dylib
* - 位置:/usr/lib、/usr/local/lib等
* - 示例libMyLib.dylib
*
* 2. **Framework** - macOS特有格式
* - 位置:/System/Library/Frameworks、/Library/Frameworks
* - 结构MyFramework.framework/MyFramework
* - 包含:库、头文件、资源
*
* 3. **Bundle** - 可加载插件
* - 扩展名:.bundle、.plugin
* - 常用于插件系统
*
* ### dlopen标志说明
* - RTLD_LAZY延迟解析符号性能更好
* - 仅在符号首次使用时解析
* - 如果符号不存在,会在使用时才报错
*
* - RTLD_NOW立即解析所有符号
* - 加载时解析所有符号
* - 如果有未定义符号dlopen会失败
*
* - RTLD_LOCAL符号仅在本库内可见默认
* - RTLD_GLOBAL符号对后续加载的库可见
*
* ### macOS搜索路径
* dlopen按以下顺序搜索
* 1. @executable_path - 可执行文件所在目录
* 2. @loader_path - 加载库所在目录
* 3. @rpath - 运行时搜索路径
* 4. DYLD_LIBRARY_PATH环境变量如果设置
* 5. /usr/local/lib
* 6. /usr/lib
*
* ### 系统完整性保护SIP
* macOS 10.11+启用了SIP限制
* - DYLD_LIBRARY_PATH在受保护进程中被忽略
* - 无法修改系统库路径
* - 某些目录需要特殊权限
*
* ### 错误处理
* dlopen失败时返回nullptr使用dlerror()获取错误信息:
* - "image not found" - 库文件不存在
* - "no suitable image found" - 架构不匹配
* - "symbol not found" - 缺少符号RTLD_NOW模式
* - "Library not loaded" - 缺少依赖库
*
* @param lib_path 动态库路径(支持相对路径、绝对路径、@rpath等
* @return bool true表示成功false表示失败
*
* @note 会先调用close()关闭已打开的库
* @note 使用RTLD_LAZY以获得更好的性能
* @note 失败时可以调用dlerror()获取详细错误信息
*/
auto lib_handle::open(const std::filesystem::path& lib_path) -> bool {
close();
// 使用RTLD_LAZY延迟加载符号
// 在macOS上dlopen可以加载.dylib、.framework和.bundle
handle_ = dlopen(lib_path.c_str(), RTLD_LAZY);
return handle_ != nullptr;
}
/**
* @brief 关闭动态库macOS实现
*
* 使用POSIX标准的dlclose卸载动态库。
*
* ## 实现细节
*
* ### 引用计数
* - dlclose递减库的引用计数
* - 当引用计数降为0时库才会真正卸载
* - 如果库被多次dlopen需要相同次数的dlclose
*
* ### 析构函数
* 库卸载时会调用:
* - C++全局对象的析构函数
* - __attribute__((destructor))标记的函数
* - atexit()注册的清理函数
*
* ### macOS特性
* - 某些系统库可能无法卸载(返回错误但不影响程序)
* - Framework的卸载也会卸载其资源和依赖
* - 卸载时dyld会处理依赖关系
*
* ### 线程安全
* - dlclose是线程安全的
* - 但需要确保没有线程正在使用库中的代码
* - 正在执行的函数可能导致崩溃
*
* @note 调用后handle_会被设置为nullptr
* @note 重复调用是安全的会检查handle_是否为空
* @note 卸载失败时dlerror()会返回错误信息
*/
void lib_handle::close() {
if (handle_) {
dlclose(handle_);
handle_ = nullptr;
}
}
/**
* @brief 获取函数地址macOS实现
*
* 使用POSIX标准的dlsym从动态库中获取符号地址。
*
* ## 实现细节
*
* ### 符号查找
* dlsym可以查找
* - C函数直接使用函数名
* - C++函数需要使用extern "C"避免名称修饰
* - 全局变量:可以获取变量地址
* - 弱符号如果存在返回地址否则返回nullptr
*
* ### 名称修饰Name Mangling
* C++编译器会修饰函数名以支持重载:
* @code
* // C++函数
* int add(int a, int b); // 可能被修饰为 __Z3addii
*
* // 避免修饰
* extern "C" int add(int a, int b); // 保持为 add
* @endcode
*
* ### macOS符号约定
* - 前导下划线C符号通常有前导下划线但dlsym会自动处理
* - 隐藏符号使用__attribute__((visibility("hidden")))的符号无法查找
* - 弱符号使用__attribute__((weak))的符号可以被覆盖
*
* ### 错误处理
* dlsym失败返回nullptr常见原因
* - 符号不存在
* - 符号被标记为隐藏
* - 名称修饰不匹配
* - 库未正确导出符号
*
* ### 使用示例
* @code
* // 假设库中有extern "C" int calculate(int x);
* lib_handle lib;
* lib.open("libmath.dylib");
*
* // 获取函数指针
* auto func = lib.get_function_by_name<int(int)>("calculate");
* if (func) {
* int result = func(42);
* }
* @endcode
*
* @param func_name 符号名称C风格字符串
* @return void* 符号地址指针失败时返回nullptr
*
* @note 如果handle_为空直接返回nullptr
* @note 返回的指针需要转换为正确的类型才能使用
* @note 可以使用dlerror()获取失败原因
*/
auto lib_handle::get_raw_function(const std::string& func_name) -> void* {
if (!handle_) {
return nullptr;
}
// dlsym返回符号地址函数或变量
// 在macOS上dlsym会自动处理前导下划线
return dlsym(handle_, func_name.c_str());
}


@@ -0,0 +1,130 @@
/**
* @file lib_handle.cpp
* @brief Windows平台动态库加载工具实现
*
* 实现了lib_handle.h中声明的跨平台动态库加载工具的Windows版本。
* 使用Windows APILoadLibrary、FreeLibrary、GetProcAddress来实现
* 动态库的加载、卸载和函数查找功能。
*
* ## Windows API说明
* - LoadLibrary加载动态链接库DLL
* - FreeLibrary卸载动态链接库
* - GetProcAddress从DLL中获取函数地址
*
* @note 仅在Windows平台编译
*/
#include "lib_handle.h"
#include <windows.h>
/**
* @brief 打开动态库Windows实现
*
* 使用Windows API LoadLibrary加载指定路径的DLL文件。
*
* ## 实现细节
*
* ### LoadLibrary行为
* - 如果DLL已经加载会增加其引用计数
* - 搜索顺序:
* 1. 应用程序目录
* 2. 系统目录System32
* 3. Windows目录
* 4. 当前目录
* 5. PATH环境变量中的目录
*
* ### 错误处理
* LoadLibrary失败时返回NULL可能的原因
* - 文件不存在
* - 不是有效的DLL文件
* - 缺少依赖的DLL
* - 架构不匹配32位/64位
* - 权限不足
*
* @param lib_path DLL文件的路径
* @return bool true表示成功false表示失败
*
* @note 会先调用close()关闭已打开的库
* @note 失败时可以使用GetLastError()获取详细错误代码
*/
auto lib_handle::open(const std::filesystem::path& lib_path) -> bool {
close();
// 使用LoadLibrary加载DLL
// 在Windows上 path::c_str() 返回 const wchar_t*,因此这里用 string().c_str() 取得 LoadLibraryA 所需的 const char*
handle_ = LoadLibraryA(lib_path.string().c_str());
return handle_ != nullptr;
}
/**
* @brief 关闭动态库Windows实现
*
* 使用Windows API FreeLibrary卸载DLL。
*
* ## 实现细节
*
* ### 引用计数
* - FreeLibrary递减DLL的引用计数
* - 当引用计数降为0时DLL才会真正卸载
* - 如果DLL被多次LoadLibrary需要相同次数的FreeLibrary
*
* ### 线程安全
* - DLL的DllMain函数会在卸载时被调用DLL_PROCESS_DETACH
* - 需要确保没有其他线程正在使用DLL中的代码或数据
*
* @note 调用后handle_会被设置为nullptr
* @note 重复调用是安全的会检查handle_是否为空
*/
void lib_handle::close() {
if (handle_) {
FreeLibrary(static_cast<HMODULE>(handle_));
handle_ = nullptr;
}
}
/**
* @brief 获取函数地址Windows实现
*
* 使用Windows API GetProcAddress从DLL中获取导出函数的地址。
*
* ## 实现细节
*
* ### 函数查找
* GetProcAddress通过函数名查找
* - C函数直接使用函数名
* - C++函数需要使用extern "C"避免名称修饰name mangling
* - 导出序号:也可以使用序号而不是名称(不推荐)
*
* ### 名称修饰
* C++编译器会对函数名进行修饰,导致查找失败。解决方法:
* @code
* // 在DLL中声明函数时使用
* extern "C" __declspec(dllexport) int my_function(int x);
* @endcode
*
* ### 调用约定
* 需要确保函数的调用约定匹配:
* - __cdeclC默认调用约定
* - __stdcallWindows API调用约定
* - __fastcall快速调用约定
*
* @param func_name 函数名称C风格字符串
* @return void* 函数地址指针失败时返回nullptr
*
* @note 如果handle_为空直接返回nullptr
* @note 返回的指针需要转换为正确的函数指针类型才能调用
* @note 使用FARPROC类型表示函数指针然后转换为void*
*/
auto lib_handle::get_raw_function(const std::string& func_name) -> void* {
if (!handle_) {
return nullptr;
}
// GetProcAddress返回FARPROC类型函数指针
// FARPROC是Windows定义的通用函数指针类型
// 将其转换为void*以保持跨平台的接口一致性
return reinterpret_cast<void*>(
GetProcAddress(static_cast<HMODULE>(handle_), func_name.c_str())
);
}


@@ -1,7 +1,7 @@
project(alicho_network)
find_package(Boost COMPONENTS interprocess date_time thread CONFIG REQUIRED)
find_package(zeromq REQUIRED)
find_package(cppzmq CONFIG REQUIRED)
find_package(cppzmq REQUIRED)
find_package(yalantinglibs CONFIG REQUIRED)


@@ -88,7 +88,7 @@ namespace alicho {
process_error process_monitor::update_status() {
try {
auto previous_state = monitored_process_.state;
// auto previous_state = monitored_process_.state;
// 检查进程状态
bool is_running = check_process_running();


@@ -1,4 +1,7 @@
project(alicho_simd)
simple_library(STATIC)
target_link_libraries(${PROJECT_NAME} PUBLIC alicho_misc)
add_subdirectory(simd_interface)
add_subdirectory(simd_scaler)
add_subdirectory(simd_sse)
add_subdirectory(simd_avx)
add_subdirectory(simd_avx512)
add_subdirectory(misc)


@@ -1,158 +0,0 @@
/**
* @file simd_audio_processing.cpp
* @brief SIMD音频处理函数注册模块实现
*
* 本文件负责将所有音频处理函数标量和SIMD优化版本注册到SIMD函数调度器中。
* 注册过程采用分层策略:
* 1. 根据平台x86或ARM选择合适的SIMD实现
* 2. 为每个SIMD指令集版本注册对应的函数实现
* 3. 运行时根据CPU特性自动选择最优实现
*
* 注册流程:
* - 标量实现:所有平台通用的基准实现
* - x86平台SSE/SSE3/SSE4/AVX/AVX2/AVX512优化实现
* - ARM平台NEON优化实现
*/
#include "simd_audio_processing.h"
#include "scalar_audio_processing_func.h"
#include "simd_func_dispatcher.h"
#include "x86_simd_audio_processing_func.h"
#include "arm_simd_audio_processing_func.h"
/**
* @brief x86平台自动注册宏 - 注册所有x86 SIMD版本
*
* 此宏为指定函数注册多个SIMD实现版本
* - SCALAR: 标量实现(所有平台通用的基准版本)
* - SSE/SSE3/SSE4: 使用相同的SSE实现128位向量处理4个float
* - AVX/AVX2: 使用相同的AVX实现256位向量处理8个float
* - AVX512: 最新的AVX-512实现512位向量处理16个float
*
* 注册顺序说明:
* 1. 先注册标量版本作为后备实现
* 2. 按指令集从旧到新注册SIMD版本
* 3. 运行时调度器会根据CPU特性选择最优版本
*
* 注意SSE/SSE3/SSE4共用同一实现是因为这些指令集间差异不影响音频处理性能
*/
#if ALICHO_PLATFORM_X86
#define AUTO_REGISTER_SIMD_FUNCTION(func_name)\
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::SCALAR, scalar_audio_processing_func::func_name); \
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::SSE, x86_simd_audio_processing_func::func_name##_sse); \
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::SSE3, x86_simd_audio_processing_func::func_name##_sse); \
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::SSE4, x86_simd_audio_processing_func::func_name##_sse); \
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::AVX, x86_simd_audio_processing_func::func_name##_avx); \
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::AVX2, x86_simd_audio_processing_func::func_name##_avx); \
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::AVX512, x86_simd_audio_processing_func::func_name##_avx512);
/**
* @brief ARM平台自动注册宏 - 注册标量和NEON版本
*
* ARM平台注册策略
* - SCALAR: 标量实现(所有平台通用)
* - NEON: ARM的SIMD指令集实现128位向量处理4个float
*
* 注册顺序说明:
* 1. 先注册标量版本作为后备实现
* 2. 注册NEON优化版本大多数现代ARM处理器都支持
* 3. 运行时根据CPU是否支持NEON自动选择
*/
#elif ALICHO_PLATFORM_ARM
#define AUTO_REGISTER_SIMD_FUNCTION(func_name) \
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::SCALAR, scalar_audio_processing_func::func_name); \
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::NEON, neon_simd_audio_processing_func::func_name##_neon);
#endif
/**
* @brief 强制使用标量实现的注册宏
*
* 某些函数可能由于以下原因只使用标量实现:
* 1. 算法特性不适合SIMD优化如分支过多
* 2. SIMD实现收益不明显甚至可能降低性能
* 3. 实现复杂度过高,维护成本超过性能收益
*
* 此宏将标量实现注册到所有SIMD版本槽位确保
* - 无论CPU支持何种指令集都使用相同的标量实现
* - 避免因缺少SIMD实现导致的运行时错误
* - 保持API一致性调用方无需关心实现细节
*
* 当前使用此宏的函数:
* - apply_gain: 虽然可以SIMD优化但此处暂时使用标量版本
*/
#define FORCE_SCALAR_SIMD_FUNCTION(func_name) \
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::SCALAR, scalar_audio_processing_func::func_name); \
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::SSE, scalar_audio_processing_func::func_name); \
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::SSE3, scalar_audio_processing_func::func_name); \
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::SSE4, scalar_audio_processing_func::func_name); \
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::AVX, scalar_audio_processing_func::func_name); \
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::AVX2, scalar_audio_processing_func::func_name); \
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::AVX512, scalar_audio_processing_func::func_name); \
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::NEON, scalar_audio_processing_func::func_name);
/**
* @brief 注册所有音频处理函数到SIMD调度器
*
* 此函数在程序初始化时调用,负责:
* 1. 将所有音频处理函数的各个SIMD版本注册到调度器
* 2. 建立函数名到实现的映射关系
* 3. 为运行时动态分发做准备
*
* 注册的函数列表:
* - mix_audio: 音频混合(两路音频相加)
* - apply_gain: 音量增益调节(暂时使用标量版本)
* - calculate_rms: RMS电平计算均方根值
* - calculate_peak: 峰值电平检测
* - normalize_audio: 音频归一化处理
* - stereo_to_mono: 立体声转单声道
* - limit_audio: 音频限幅器(动态范围压缩)
* - fade_audio: 淡入淡出效果
* - simple_eq: 简单三段均衡器(低频/中频/高频)
*
* 注册顺序考虑:
* - 按功能类型分组:基础操作 -> 分析 -> 效果处理
* - 简单函数在前,复杂函数在后
* - 便于理解和维护
*
* 错误处理:
* - REGISTER_SIMD_FUNCTION宏内部会处理重复注册
* - 如果函数不存在会在编译时报错(类型安全)
*/
void audio_processing_registry::register_all_functions() {
AUTO_REGISTER_SIMD_FUNCTION(mix_audio); // 音频混合
FORCE_SCALAR_SIMD_FUNCTION(apply_gain); // 增益调节(标量版本)
AUTO_REGISTER_SIMD_FUNCTION(calculate_rms); // RMS计算
AUTO_REGISTER_SIMD_FUNCTION(calculate_peak); // 峰值检测
AUTO_REGISTER_SIMD_FUNCTION(normalize_audio); // 归一化
AUTO_REGISTER_SIMD_FUNCTION(stereo_to_mono); // 立体声转单声道
AUTO_REGISTER_SIMD_FUNCTION(limit_audio); // 限幅器
AUTO_REGISTER_SIMD_FUNCTION(fade_audio); // 淡入淡出
AUTO_REGISTER_SIMD_FUNCTION(simple_eq); // 简单均衡器
}
/**
* @brief 打印所有已注册函数的状态信息
*
* 此函数用于调试和诊断,输出内容包括:
* 1. 已注册的函数名称列表
* 2. 每个函数可用的SIMD版本
* 3. 当前CPU支持的指令集
* 4. 运行时将使用的具体实现版本
*
* 使用场景:
* - 开发调试:验证函数是否正确注册
* - 性能分析确认使用了最优的SIMD版本
* - 问题诊断检查CPU特性检测是否正常
* - 用户支持:提供系统配置信息
*
* 输出示例:
* "Function: mix_audio
* - SCALAR: available
* - SSE: available
* - AVX: available (selected)
* - AVX512: not available (CPU not supported)"
*/
void audio_processing_registry::print_available_functions() {
simd_func_dispatcher::instance().print_registry_status();
}


@@ -1,146 +0,0 @@
/**
* @file simd_audio_processing.h
* @brief SIMD音频处理函数注册器 - 音频处理功能的统一注册和管理入口
*
* 本文件定义了音频处理函数的注册器类,负责将所有音频处理函数
* 包括标量版本和各种SIMD优化版本注册到函数分发器中。
*
* 核心职责:
* 1. **函数注册**将所有音频处理函数注册到SIMD函数分发器
* 2. **平台适配**根CPU特性选择合适的SIMD实现
* 3. **性能优化**:确保运行时使用最优的函数实现
* 4. **调试支持**:提供可用函数列表打印功能
*
* 与函数分发器的关系:
* ```
* simd_func_dispatcher (分发器)
* ↑
* | 注册
* |
* audio_processing_registry (本文件)
* |
* | 包含
* ↓
* 标量版本 + SSE + AVX + AVX512 + NEON (实现)
* ```
*
* 支持的音频处理功能:
* - 音频混合 (mix_audio)
* - 增益控制 (apply_gain)
* - RMS计算 (calculate_rms)
* - 峰值检测 (calculate_peak)
* - 音频归一化 (normalize_audio)
* - 立体声转单声道 (stereo_to_mono)
* - 音频限幅 (limit_audio)
* - 淡入淡出 (fade_audio)
* - 三段均衡器 (simple_eq)
*
* 使用方式:
* ```cpp
* // 在程序启动时调用一次
* audio_processing_registry::register_all_functions();
*
* // 调试时查看可用函数
* audio_processing_registry::print_available_functions();
*
* // 之后通过函数分发器使用
* auto func = simd_func_dispatcher::get_function<...>("mix_audio");
* ```
*
* @note 此类只包含静态方法,不需要实例化
* @see simd_func_dispatcher.h 函数分发器的定义
* @see scalar_audio_processing_func.h 标量实现
* @see x86_simd_audio_processing_func.h x86 SIMD实现
* @see arm_simd_audio_processing_func.h ARM NEON实现
*/
#pragma once
/**
* @class audio_processing_registry
* @brief 音频处理函数注册器
*
* 负责将所有音频处理函数注册到SIMD函数分发器中。
* 注册过程会根据当前CPU的特性自动选择最优的实现版本。
*
* 注册顺序和优先级:
* 1. 首先注册标量版本(保底实现,所有平台可用)
* 2. 然后注册SIMD版本如果CPU支持
* - x86平台SSE -> AVX -> AVX512按性能递增
* - ARM平台NEON
*
* 函数分发器会根据注册顺序,优先使用后注册的高性能版本。
*
* 线程安全性:
* - register_all_functions() 应该在程序启动时调用一次
* - 不是线程安全的,不应该并发调用
* - 注册完成后,使用函数是线程安全的
*/
class audio_processing_registry {
public:
/**
* @brief 注册所有音频处理函数
*
* 将所有支持的音频处理函数注册到SIMD函数分发器中。
* 此函数会检测当前CPU特性并注册所有兼容的实现版本。
*
* 注册的函数包括:
* - 音频信号处理mix_audio, apply_gain
* - 音频分析calculate_rms, calculate_peak
* - 音频处理效果normalize_audio, stereo_to_mono, limit_audio, fade_audio, simple_eq
*
* 每个函数都会注册多个版本如果CPU支持
* - 标量版本(必定存在)
* - SSE版本x86平台如果支持
* - AVX版本x86平台如果支持
* - AVX512版本x86平台如果支持
* - NEON版本ARM平台如果支持
*
* @note 应该在程序启动早期调用,只需调用一次
* @note 不是线程安全的,不应并发调用
* @warning 重复调用可能导致重复注册
*
* 使用示例:
* ```cpp
* int main() {
* // 初始化阶段
* audio_processing_registry::register_all_functions();
*
* // 后续使用
* auto mix_func = simd_func_dispatcher::get_function<...>("mix_audio");
* mix_func(src1, src2, dst, samples);
* return 0;
* }
* ```
*/
static void register_all_functions();
/**
* @brief 打印所有可用的音频处理函数
*
* 输出所有已注册的音频处理函数及其实现版本。
* 用于调试和验证函数注册是否成功。
*
* 输出格式示例:
* ```
* Available audio processing functions:
* mix_audio:
* - scalar (baseline)
* - sse (4x SIMD)
* - avx (8x SIMD)
* apply_gain:
* - scalar (baseline)
* - neon (ARM SIMD)
* ...
* ```
*
* 应用场景:
* - 验证SIMD函数是否正确注册
* - 检查当前平台支持哪些优化版本
* - 性能调试和分析
*
* @note 此函数仅用于调试,不影响程序功能
* @note 输出会打印到标准输出或日志系统
*/
static void print_available_functions();
};

File diff suppressed because it is too large.


@@ -0,0 +1,8 @@
project(alicho_simd)
simple_library(STATIC)
target_link_libraries(${PROJECT_NAME} PUBLIC alicho_misc alicho_simd_interface)
if(UNIX AND NOT APPLE)
target_link_libraries(${PROJECT_NAME} PUBLIC dl)
endif()


@@ -31,6 +31,7 @@
#pragma once
#include <cstdint>
#include <limits>
#include <cstddef>
/**
* @namespace scalar_audio_processing_func


@@ -87,6 +87,7 @@
*/
#pragma once
#include <cstddef>
#if ALICHO_PLATFORM_X86
/**


@@ -293,9 +293,9 @@ void cpu_feature_detector::detect_x86_features() {
if ((cpuid_7.ebx & (1 << 30)) != 0) { info_.features |= static_cast<uint32_t>(cpu_feature::AVX512BW); } // bit 30: AVX-512 Byte/Word字节/字操作)
if ((cpuid_7.ebx & (1 << 17)) != 0) { info_.features |= static_cast<uint32_t>(cpu_feature::AVX512DQ); } // bit 17: AVX-512 DQ双字/四字操作)
// 从CPUID.7.0.ECX寄存提取更多AVX-512扩展特性
// 从CPUID.7.0.ECX寄存提取更多AVX-512扩展特性
if ((cpuid_7.ecx & (1 << 21)) != 0) { info_.features |= static_cast<uint32_t>(cpu_feature::AVX512IFMA); } // bit 21: AVX-512 IFMA整数融合乘加
if ((cpuid_7.ecx & (1 << 2)) != 0) { info_.features |= static_cast<uint32_t>(cpu_feature::AVX512VBMI); } // bit 1: AVX-512 VBMI向量字节操作
if ((cpuid_7.ecx & (1 << 1)) != 0) { info_.features |= static_cast<uint32_t>(cpu_feature::AVX512VBMI); } // bit 1: AVX-512 VBMI向量字节操作
// ========================================================================
// 步骤4: 确定最高可用的SIMD级别

src/simd/misc/simd_api.h

@@ -0,0 +1,95 @@
#pragma once
#include "simd_func_dispatcher.h"
namespace simd {
// ============================================================================
// SIMD函数的零开销包装接口
//
// 这些inline函数会被编译器优化为直接的函数指针调用
// 实现零开销的SIMD函数调度
// ============================================================================
inline void fill_buffer(float* buffer, float value, size_t num_samples) {
simd_func_dispatcher::instance().get_fill_buffer()(buffer, value, num_samples);
}
inline void mix_audio(const float* src1, const float* src2,
float* dst, size_t num_samples) {
simd_func_dispatcher::instance().get_mix_audio()(src1, src2, dst, num_samples);
}
inline void apply_gain(const float* src, float* dst,
float gain, size_t num_samples) {
simd_func_dispatcher::instance().get_apply_gain()(src, dst, gain, num_samples);
}
inline float calculate_rms(const float* src, size_t num_samples) {
return simd_func_dispatcher::instance().get_calculate_rms()(src, num_samples);
}
inline float calculate_peak(const float* src, size_t num_samples) {
return simd_func_dispatcher::instance().get_calculate_peak()(src, num_samples);
}
inline void normalize_audio(const float* src, float* dst,
float target_peak, size_t num_samples) {
simd_func_dispatcher::instance().get_normalize_audio()(
src, dst, target_peak, num_samples);
}
inline void stereo_to_mono(const float* stereo_src, float* mono_dst,
size_t num_stereo_samples) {
simd_func_dispatcher::instance().get_stereo_to_mono()(
stereo_src, mono_dst, num_stereo_samples);
}
inline void limit_audio(const float* src, float* dst, float threshold,
float* limiter_state, float sample_rate, size_t num_samples) {
simd_func_dispatcher::instance().get_limit_audio()(
src, dst, threshold, limiter_state, sample_rate, num_samples);
}
inline void fade_audio(const float* src, float* dst, size_t fade_in_samples,
size_t fade_out_samples, size_t num_samples) {
simd_func_dispatcher::instance().get_fade_audio()(
src, dst, fade_in_samples, fade_out_samples, num_samples);
}
inline void simple_eq(const float* src, float* dst, float low_gain,
float mid_gain, float high_gain, float* eq_state,
size_t num_samples) {
simd_func_dispatcher::instance().get_simple_eq()(
src, dst, low_gain, mid_gain, high_gain, eq_state, num_samples);
}
// ============================================================================
// 调试和信息接口
// ============================================================================
/// 获取当前激活的SIMD版本
/// @return 当前使用的SIMD指令集版本
inline auto get_active_simd_version() -> simd_func_version {
return simd_func_dispatcher::instance().get_active_version();
}
/// 获取当前激活的SIMD版本的字符串表示
/// @return 版本名称(如 "AVX2", "NEON", "SCALAR" 等)
inline auto get_active_simd_version_string() -> std::string {
auto version = get_active_simd_version();
switch (version) {
case simd_func_version::SCALAR: return "SCALAR";
case simd_func_version::SSE: return "SSE";
case simd_func_version::SSE3: return "SSE3";
case simd_func_version::SSE4: return "SSE4";
case simd_func_version::AVX: return "AVX";
case simd_func_version::AVX2: return "AVX2";
case simd_func_version::AVX512: return "AVX512";
case simd_func_version::NEON: return "NEON";
case simd_func_version::NEON_FP16: return "NEON_FP16";
default: return "UNKNOWN";
}
}
} // namespace simd
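
A sketch of block-wise use of the stateful `limit_audio` wrapper; the single-float limiter state matches how the SSE/AVX implementations in this commit read and write `*limiter_state`, while the buffer/block setup is illustrative:

```cpp
#include <vector>
#include "simd_api.h"

void limit_stream(std::vector<std::vector<float>>& blocks,
                  float threshold, float sample_rate) {
    float limiter_state = 1.0f;  // unity gain before the first block
    for (auto& block : blocks) {
        // In-place limiting; the gain envelope carries over between blocks
        // through limiter_state.
        simd::limit_audio(block.data(), block.data(), threshold,
                          &limiter_state, sample_rate, block.size());
    }
}
```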


@@ -0,0 +1,201 @@
#include "simd_func_dispatcher.h"
#include "cpu_features.h"
#include "logger.h"
#include <string_view>
#define SIMD_FUNC_DISPATCHER_LOG_MODULE "simd_func_dispatcher"
// 将 simd_level 映射到 simd_func_version
static std::string_view version_to_lib_name(simd_func_version version) {
switch (version) {
case simd_func_version::SCALAR: return "scaler";
case simd_func_version::SSE: return "sse";
case simd_func_version::AVX: return "avx";
case simd_func_version::AVX2: return "avx2";
case simd_func_version::AVX512: return "avx512";
case simd_func_version::NEON: return "neon";
case simd_func_version::NEON_FP16: return "neon_fp16";
default: return "";
}
}
simd_func_dispatcher::simd_func_dispatcher() {
for (int i = 0; i < static_cast<int>(simd_func_version::COUNT); ++i) {
auto version = static_cast<simd_func_version>(i);
auto lib_suffix = version_to_lib_name(version);
if (lib_suffix.empty()) {
continue;
}
std::string lib_name;
#if ALICHO_PLATFORM_WINDOWS
lib_name = "alicho_simd_" + std::string(lib_suffix) + ".dll";
#elif ALICHO_PLATFORM_LINUX
lib_name = "./libalicho_simd_" + std::string(lib_suffix) + ".so";
#elif ALICHO_PLATFORM_APPLE
lib_name = "./libalicho_simd_" + std::string(lib_suffix) + ".dylib";
#endif
auto handle = std::make_unique<lib_handle>();
if (handle->open(lib_name)) {
log_module_info(SIMD_FUNC_DISPATCHER_LOG_MODULE, "Successfully loaded SIMD library: {}", lib_name);
loaded_libraries_[version] = std::move(handle);
} else {
log_module_debug(SIMD_FUNC_DISPATCHER_LOG_MODULE, "Could not find or load SIMD library: {}", lib_name);
}
}
// 初始化函数指针
initialize_function_pointers();
}
simd_func_dispatcher::~simd_func_dispatcher() = default;
// 初始化函数指针
void simd_func_dispatcher::initialize_function_pointers() {
// 1. 检测CPU能力
const auto& cpu_info = get_cpu_info();
auto recommended_level = get_recommended_simd_level();
auto preferred_version = simd_level_to_version(recommended_level);
log_module_info(SIMD_FUNC_DISPATCHER_LOG_MODULE, "检测到CPU最高SIMD级别: {}", static_cast<int>(cpu_info.max_simd_level));
log_module_info(SIMD_FUNC_DISPATCHER_LOG_MODULE, "推荐使用SIMD级别: {}", static_cast<int>(recommended_level));
// 2. 找到可用版本
auto target_version = find_fallback_version(preferred_version);
log_module_info(SIMD_FUNC_DISPATCHER_LOG_MODULE, "选择的SIMD版本: {}", static_cast<int>(target_version));
// 3. 加载函数指针
if (!try_load_functions(target_version)) {
throw std::runtime_error("Failed to load SIMD functions for any available version");
}
active_version_ = target_version;
log_module_info(SIMD_FUNC_DISPATCHER_LOG_MODULE, "成功初始化SIMD函数调度器激活版本: {}", static_cast<int>(active_version_));
}
// 回退策略
auto simd_func_dispatcher::find_fallback_version(simd_func_version preferred) -> simd_func_version {
// 定义回退序列
#if ALICHO_PLATFORM_X86
// x86/x64 回退序列:从高到低
static const simd_func_version x86_fallback[] = {
simd_func_version::AVX512,
simd_func_version::AVX2,
simd_func_version::AVX,
simd_func_version::SSE,
simd_func_version::SCALAR
};
// 从首选版本开始查找
bool found_preferred = false;
for (auto version : x86_fallback) {
if (version == preferred) {
found_preferred = true;
}
if (found_preferred && loaded_libraries_.count(version) > 0) {
log_module_info(SIMD_FUNC_DISPATCHER_LOG_MODULE, "找到可用的SIMD版本: {}", static_cast<int>(version));
return version;
}
}
#elif ALICHO_PLATFORM_ARM
// ARM 回退序列
static const simd_func_version arm_fallback[] = {
simd_func_version::NEON_FP16,
simd_func_version::NEON,
simd_func_version::SCALAR
};
bool found_preferred = false;
for (auto version : arm_fallback) {
if (version == preferred) {
found_preferred = true;
}
if (found_preferred && loaded_libraries_.count(version) > 0) {
log_module_info(SIMD_FUNC_DISPATCHER_LOG_MODULE, "找到可用的SIMD版本: {}", static_cast<int>(version));
return version;
}
}
#endif
// 最后回退到标量版本
if (loaded_libraries_.count(simd_func_version::SCALAR) > 0) {
log_module_info(SIMD_FUNC_DISPATCHER_LOG_MODULE, "回退到标量版本");
return simd_func_version::SCALAR;
}
throw std::runtime_error("No SIMD library available, not even scalar version");
}
// 尝试从指定版本加载函数
auto simd_func_dispatcher::try_load_functions(simd_func_version version) -> bool {
// 检查库是否已加载
auto it = loaded_libraries_.find(version);
if (it == loaded_libraries_.end()) {
log_module_error(SIMD_FUNC_DISPATCHER_LOG_MODULE, "SIMD库版本 {} 未加载", static_cast<int>(version));
return false;
}
auto* handle = it->second.get();
// 加载所有10个函数
fill_buffer_ = load_function<fill_buffer_t>(handle, "fill_buffer");
mix_audio_ = load_function<mix_audio_t>(handle, "mix_audio");
apply_gain_ = load_function<apply_gain_t>(handle, "apply_gain");
calculate_rms_ = load_function<calculate_rms_t>(handle, "calculate_rms");
calculate_peak_ = load_function<calculate_peak_t>(handle, "calculate_peak");
normalize_audio_ = load_function<normalize_audio_t>(handle, "normalize_audio");
stereo_to_mono_ = load_function<stereo_to_mono_t>(handle, "stereo_to_mono");
limit_audio_ = load_function<limit_audio_t>(handle, "limit_audio");
fade_audio_ = load_function<fade_audio_t>(handle, "fade_audio");
simple_eq_ = load_function<simple_eq_t>(handle, "simple_eq");
// 检查是否所有函数都加载成功
if (!fill_buffer_ || !mix_audio_ || !apply_gain_ || !calculate_rms_ ||
!calculate_peak_ || !normalize_audio_ || !stereo_to_mono_ ||
!limit_audio_ || !fade_audio_ || !simple_eq_) {
log_module_error(SIMD_FUNC_DISPATCHER_LOG_MODULE, "加载SIMD函数失败版本: {}", static_cast<int>(version));
return false;
}
log_module_info(SIMD_FUNC_DISPATCHER_LOG_MODULE, "成功加载所有SIMD函数版本: {}", static_cast<int>(version));
return true;
}
// 从lib_handle加载单个函数
template<typename FuncT>
auto simd_func_dispatcher::load_function(lib_handle* handle, const std::string& name) -> FuncT {
if (!handle) {
log_module_error(SIMD_FUNC_DISPATCHER_LOG_MODULE, "lib_handle 为空,无法加载函数: {}", name);
return nullptr;
}
// FuncT 现在已经是函数指针类型 (如 void(*)(float*, float, size_t))
// 移除指针得到函数签名类型 (如 void(float*, float, size_t))
using func_signature = std::remove_pointer_t<FuncT>;
// 调用 lib_handle::get_function_by_name() 获取 std::function
auto std_func = handle->get_function_by_name<func_signature>(name);
if (!std_func) {
log_module_error(SIMD_FUNC_DISPATCHER_LOG_MODULE, "加载函数失败: {}", name);
return nullptr;
}
// 从 std::function 获取底层函数指针
// std::function::target<T>() 返回指向目标可调用对象的指针
auto* func_ptr = std_func.template target<FuncT>();
if (!func_ptr || !*func_ptr) {
log_module_error(SIMD_FUNC_DISPATCHER_LOG_MODULE, "转换函数指针失败: {}", name);
return nullptr;
}
log_module_debug(SIMD_FUNC_DISPATCHER_LOG_MODULE, "成功加载函数: {}", name);
return *func_ptr;
}
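
For reference, a sketch of what each per-version shared library is expected to export so the loader above can resolve the ten functions by name; the signatures follow `simd_interface.h` and the `extern "C"`/`SIMD_EXPORT` pattern used by the SSE/AVX sources in this commit (the scalar body here is only illustrative):

```cpp
#include <cstddef>
#include "simd_interface.h"

extern "C" {
    SIMD_EXPORT void fill_buffer(float* buffer, float value, size_t num_samples) {
        for (size_t i = 0; i < num_samples; ++i) {
            buffer[i] = value;  // plain scalar body, stands in for the SIMD kernels
        }
    }
    // mix_audio, apply_gain, calculate_rms, calculate_peak, normalize_audio,
    // stereo_to_mono, limit_audio, fade_audio and simple_eq must be exported
    // the same way for try_load_functions() to succeed.
}
```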


@@ -0,0 +1,223 @@
#pragma once
#include <algorithm>
#include <array>
#include <functional>
#include <memory>
#include <stdexcept>
#include <unordered_map>
#include "lib_handle.h"
#include "cpu_features.h"
#include "simd_interface.h"
/**
* @enum simd_func_version
* @brief SIMD函数版本枚举 - 定义所有可能的函数实现版本
*
* 该枚举定义了函数可以有的所有SIMD优化版本。
* 每个版本对应一个特定的SIMD指令集级别。
*
* 版本排序:
* - 枚举值从低到高表示性能从弱到强
* - SCALAR是最基础的版本所有CPU都支持
* - COUNT用于数组大小不是实际版本
*
* 与simd_level的关系
* - simd_level表示CPU的能力级别
* - simd_func_version表示函数的实现版本
* - 通过simd_level_to_version()进行转换
*
* @note 不是所有函数都需要实现所有版本
* @see simd_level, simd_level_to_version()
*/
enum class simd_func_version {
/** 标量实现 - 纯C++代码无SIMD优化
* - 兼容性所有CPU
* - 性能基准性能1x
* - 用途:最低保底实现、参考实现
* - 必要性:强制要求,作为回退版本
*/
SCALAR = 0,
/** SSE实现 - 使用SSE/SSE2指令
* - 兼容性2003年后的所有x86/x64
* - 向量宽度128位
* - 性能提升约2-4倍
*/
SSE,
/** SSE3实现 - 使用SSE3/SSSE3指令
* - 兼容性2006年后的主流CPU
* - 新增功能:水平运算、复数支持
* - 性能提升比SSE快10-20%
*/
SSE3,
/** SSE4实现 - 使用SSE4.1/SSE4.2指令
* - 兼容性2008年后的主流CPU
* - 新增功能点积、blend、字符串处理
* - 性能提升比SSE3快15-30%
*/
SSE4,
/** AVX实现 - 使用AVX指令
* - 兼容性2011年后的主流CPU
* - 向量宽度256位
* - 性能提升约2倍SSE4性能
*/
AVX,
/** AVX2实现 - 使用AVX2 + FMA指令
* - 兼容性2013年后的主流CPU
* - 新增功能完整256位整数运算、FMA
* - 性能提升比AVX快50-100%
* - 推荐:当前最佳性能/兼容性平衡点
*/
AVX2,
/** AVX-512实现 - 使用AVX-512指令集
* - 兼容性2016年后的高端CPU
* - 向量宽度512位
* - 性能提升约2倍AVX2性能理论
* - 注意可能导致CPU降频
*/
AVX512,
/** NEON实现 - 使用ARM NEON指令
* - 兼容性所有ARMv8-A (64位ARM)
* - 向量宽度128位
* - 性能与SSE4相当
* - 应用移动设备、Apple Silicon
*/
NEON,
/** NEON + FP16实现 - 使用NEON半精度浮点
* - 兼容性ARMv8.2-A及更新
* - 新增硬件FP16运算
* - 性能FP16运算快2倍
* - 应用移动端AI推理
*/
NEON_FP16,
/** RISC-V向量扩展实现
* - 兼容性支持RVV的RISC-V处理器
* - 特点:可变向量长度
* - 应用嵌入式、IoT
*/
VECTOR,
/** 版本数量标记
* 用于数组大小定义,不是实际的函数版本
*/
COUNT
};
/**
* @brief 将SIMD级别转换为函数版本
* @param level CPU的SIMD级别
* @return 对应的函数版本枚举值
*
* 将cpu_feature_detector检测到的SIMD级别转换为
* 函数调度器使用的版本标识。
*
* 映射关系:
* - simd_level::NONE -> simd_func_version::SCALAR
* - simd_level::SSE -> simd_func_version::SSE
* - simd_level::AVX2 -> simd_func_version::AVX2
* - 等等...
*
* @note constexpr函数编译时求值零运行时开销
* @see simd_level, simd_func_version
*/
constexpr auto simd_level_to_version(simd_level level) {
switch (level) {
case simd_level::NONE:
return simd_func_version::SCALAR;
case simd_level::SSE:
return simd_func_version::SSE;
case simd_level::SSE3:
return simd_func_version::SSE;
case simd_level::SSE4:
return simd_func_version::SSE;
case simd_level::AVX:
return simd_func_version::AVX;
case simd_level::AVX2:
return simd_func_version::AVX2;
case simd_level::AVX512:
return simd_func_version::AVX512;
case simd_level::NEON:
return simd_func_version::NEON;
case simd_level::NEON_FP16:
return simd_func_version::NEON_FP16;
}
// 默认回退到标量版本
return simd_func_version::SCALAR;
}
class simd_func_dispatcher : public lazy_singleton<simd_func_dispatcher> {
public:
friend class lazy_singleton<simd_func_dispatcher>;
// 函数签名类型定义(使用 decltype 从 simd_interface.h 推导)
using fill_buffer_t = decltype(&fill_buffer);
using mix_audio_t = decltype(&mix_audio);
using apply_gain_t = decltype(&apply_gain);
using calculate_rms_t = decltype(&calculate_rms);
using calculate_peak_t = decltype(&calculate_peak);
using normalize_audio_t = decltype(&normalize_audio);
using stereo_to_mono_t = decltype(&stereo_to_mono);
using limit_audio_t = decltype(&limit_audio);
using fade_audio_t = decltype(&fade_audio);
using simple_eq_t = decltype(&simple_eq);
// 获取函数指针的接口
[[nodiscard]] auto get_fill_buffer() const noexcept -> fill_buffer_t { return fill_buffer_; }
[[nodiscard]] auto get_mix_audio() const noexcept -> mix_audio_t { return mix_audio_; }
[[nodiscard]] auto get_apply_gain() const noexcept -> apply_gain_t { return apply_gain_; }
[[nodiscard]] auto get_calculate_rms() const noexcept -> calculate_rms_t { return calculate_rms_; }
[[nodiscard]] auto get_calculate_peak() const noexcept -> calculate_peak_t { return calculate_peak_; }
[[nodiscard]] auto get_normalize_audio() const noexcept -> normalize_audio_t { return normalize_audio_; }
[[nodiscard]] auto get_stereo_to_mono() const noexcept -> stereo_to_mono_t { return stereo_to_mono_; }
[[nodiscard]] auto get_limit_audio() const noexcept -> limit_audio_t { return limit_audio_; }
[[nodiscard]] auto get_fade_audio() const noexcept -> fade_audio_t { return fade_audio_; }
[[nodiscard]] auto get_simple_eq() const noexcept -> simple_eq_t { return simple_eq_; }
[[nodiscard]] auto get_active_version() const noexcept -> simd_func_version { return active_version_; }
protected:
simd_func_dispatcher();
~simd_func_dispatcher() override;
private:
// 初始化函数指针
void initialize_function_pointers();
// 尝试从指定版本加载函数
auto try_load_functions(simd_func_version version) -> bool;
// 回退策略
auto find_fallback_version(simd_func_version preferred) -> simd_func_version;
// 从lib_handle加载单个函数
template<typename FuncT>
auto load_function(lib_handle* handle, const std::string& name) -> FuncT;
// 已加载的库映射
std::unordered_map<simd_func_version, std::unique_ptr<lib_handle>> loaded_libraries_;
// 缓存的函数指针
fill_buffer_t fill_buffer_ = nullptr;
mix_audio_t mix_audio_ = nullptr;
apply_gain_t apply_gain_ = nullptr;
calculate_rms_t calculate_rms_ = nullptr;
calculate_peak_t calculate_peak_ = nullptr;
normalize_audio_t normalize_audio_ = nullptr;
stereo_to_mono_t stereo_to_mono_ = nullptr;
limit_audio_t limit_audio_ = nullptr;
fade_audio_t fade_audio_ = nullptr;
simple_eq_t simple_eq_ = nullptr;
// 当前激活的版本
simd_func_version active_version_ = simd_func_version::SCALAR;
};


@@ -0,0 +1,7 @@
project(alicho_simd_avx)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64")
simple_library(SHARED)
target_compile_options(${PROJECT_NAME} PRIVATE -mavx2)
target_link_libraries(${PROJECT_NAME} PUBLIC alicho_simd_interface)
endif()


@@ -0,0 +1,761 @@
/**
* @file x86_avx_audio_processing_func.cpp
* @brief x86 AVX音频处理函数实现
*/
#include "simd_interface.h"
#include <cmath>
#include <immintrin.h>
#include "aligned_allocator.h"
#if ALICHO_PLATFORM_X86
extern "C"
{
SIMD_EXPORT void fill_buffer(float *buffer, float value, size_t num_samples)
{
ASSERT_ALIGNED(buffer, ALIGNMENT_AVX);
constexpr size_t simd_width = 8;
constexpr size_t unroll_factor = 4;
size_t i = 0;
auto value_vec = _mm256_set1_ps(value);
for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
{
_mm256_store_ps(&buffer[i], value_vec);
_mm256_store_ps(&buffer[i + 8], value_vec);
_mm256_store_ps(&buffer[i + 16], value_vec);
_mm256_store_ps(&buffer[i + 24], value_vec);
}
for (; i + simd_width <= num_samples; i += simd_width)
{
_mm256_store_ps(&buffer[i], value_vec);
}
for (; i < num_samples; ++i)
{
buffer[i] = value;
}
}
SIMD_EXPORT void mix_audio(const float *src1, const float *src2, float *dst, size_t num_samples)
{
ASSERT_ALIGNED(src1, ALIGNMENT_AVX);
ASSERT_ALIGNED(src2, ALIGNMENT_AVX);
ASSERT_ALIGNED(dst, ALIGNMENT_AVX);
constexpr size_t simd_width = 8;
constexpr size_t unroll_factor = 4;
size_t i = 0;
for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
{
auto a0 = _mm256_load_ps(&src1[i]);
auto a1 = _mm256_load_ps(&src1[i + 8]);
auto a2 = _mm256_load_ps(&src1[i + 16]);
auto a3 = _mm256_load_ps(&src1[i + 24]);
auto b0 = _mm256_load_ps(&src2[i]);
auto b1 = _mm256_load_ps(&src2[i + 8]);
auto b2 = _mm256_load_ps(&src2[i + 16]);
auto b3 = _mm256_load_ps(&src2[i + 24]);
auto result0 = _mm256_add_ps(a0, b0);
auto result1 = _mm256_add_ps(a1, b1);
auto result2 = _mm256_add_ps(a2, b2);
auto result3 = _mm256_add_ps(a3, b3);
_mm256_store_ps(&dst[i], result0);
_mm256_store_ps(&dst[i + 8], result1);
_mm256_store_ps(&dst[i + 16], result2);
_mm256_store_ps(&dst[i + 24], result3);
}
for (; i + simd_width <= num_samples; i += simd_width)
{
auto a = _mm256_load_ps(&src1[i]);
auto b = _mm256_load_ps(&src2[i]);
auto result = _mm256_add_ps(a, b);
_mm256_store_ps(&dst[i], result);
}
for (; i < num_samples; ++i)
{
dst[i] = src1[i] + src2[i];
}
}
SIMD_EXPORT void apply_gain(const float *src, float *dst, float gain, size_t num_samples)
{
ASSERT_ALIGNED(src, ALIGNMENT_AVX);
ASSERT_ALIGNED(dst, ALIGNMENT_AVX);
constexpr size_t simd_width = 8;
constexpr size_t unroll_factor = 4;
size_t i = 0;
auto gain_vec = _mm256_set1_ps(gain);
for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
{
auto a0 = _mm256_load_ps(&src[i]);
auto a1 = _mm256_load_ps(&src[i + 8]);
auto a2 = _mm256_load_ps(&src[i + 16]);
auto a3 = _mm256_load_ps(&src[i + 24]);
auto result0 = _mm256_mul_ps(a0, gain_vec);
auto result1 = _mm256_mul_ps(a1, gain_vec);
auto result2 = _mm256_mul_ps(a2, gain_vec);
auto result3 = _mm256_mul_ps(a3, gain_vec);
_mm256_store_ps(&dst[i], result0);
_mm256_store_ps(&dst[i + 8], result1);
_mm256_store_ps(&dst[i + 16], result2);
_mm256_store_ps(&dst[i + 24], result3);
}
for (; i + simd_width <= num_samples; i += simd_width)
{
auto a = _mm256_load_ps(&src[i]);
auto result = _mm256_mul_ps(a, gain_vec);
_mm256_store_ps(&dst[i], result);
}
for (; i < num_samples; ++i)
{
dst[i] = src[i] * gain;
}
}
SIMD_EXPORT float calculate_rms(const float *src, size_t num_samples)
{
ASSERT_ALIGNED(src, ALIGNMENT_AVX);
constexpr size_t simd_width = 8;
constexpr size_t unroll_factor = 4;
size_t i = 0;
auto sum_squares0 = _mm256_setzero_ps();
auto sum_squares1 = _mm256_setzero_ps();
auto sum_squares2 = _mm256_setzero_ps();
auto sum_squares3 = _mm256_setzero_ps();
for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
{
const auto a0 = _mm256_load_ps(&src[i]);
const auto a1 = _mm256_load_ps(&src[i + 8]);
const auto a2 = _mm256_load_ps(&src[i + 16]);
const auto a3 = _mm256_load_ps(&src[i + 24]);
const auto squared0 = _mm256_mul_ps(a0, a0);
const auto squared1 = _mm256_mul_ps(a1, a1);
const auto squared2 = _mm256_mul_ps(a2, a2);
const auto squared3 = _mm256_mul_ps(a3, a3);
sum_squares0 = _mm256_add_ps(sum_squares0, squared0);
sum_squares1 = _mm256_add_ps(sum_squares1, squared1);
sum_squares2 = _mm256_add_ps(sum_squares2, squared2);
sum_squares3 = _mm256_add_ps(sum_squares3, squared3);
}
auto sum_squares = _mm256_add_ps(_mm256_add_ps(sum_squares0, sum_squares1),
_mm256_add_ps(sum_squares2, sum_squares3));
for (; i + simd_width <= num_samples; i += simd_width)
{
const auto a = _mm256_load_ps(&src[i]);
const auto squared = _mm256_mul_ps(a, a);
sum_squares = _mm256_add_ps(sum_squares, squared);
}
auto hadd1 = _mm256_hadd_ps(sum_squares, sum_squares);
auto hadd2 = _mm256_hadd_ps(hadd1, hadd1);
auto low = _mm256_extractf128_ps(hadd2, 0);
auto high = _mm256_extractf128_ps(hadd2, 1);
auto final_sum = _mm_add_ps(low, high);
double total_sum = _mm_cvtss_f32(final_sum);
for (; i < num_samples; ++i)
{
total_sum += static_cast<double>(src[i]) * static_cast<double>(src[i]);
}
return static_cast<float>(std::sqrt(total_sum / static_cast<double>(num_samples)));
}
SIMD_EXPORT float calculate_peak(const float *src, size_t num_samples)
{
ASSERT_ALIGNED(src, ALIGNMENT_AVX);
constexpr size_t simd_width = 8;
constexpr size_t unroll_factor = 4;
size_t i = 0;
auto peak_vec0 = _mm256_setzero_ps();
auto peak_vec1 = _mm256_setzero_ps();
auto peak_vec2 = _mm256_setzero_ps();
auto peak_vec3 = _mm256_setzero_ps();
for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
{
const auto a0 = _mm256_load_ps(&src[i]);
const auto a1 = _mm256_load_ps(&src[i + 8]);
const auto a2 = _mm256_load_ps(&src[i + 16]);
const auto a3 = _mm256_load_ps(&src[i + 24]);
const auto abs_a0 = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a0);
const auto abs_a1 = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a1);
const auto abs_a2 = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a2);
const auto abs_a3 = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a3);
peak_vec0 = _mm256_max_ps(peak_vec0, abs_a0);
peak_vec1 = _mm256_max_ps(peak_vec1, abs_a1);
peak_vec2 = _mm256_max_ps(peak_vec2, abs_a2);
peak_vec3 = _mm256_max_ps(peak_vec3, abs_a3);
}
auto peak_vec = _mm256_max_ps(_mm256_max_ps(peak_vec0, peak_vec1),
_mm256_max_ps(peak_vec2, peak_vec3));
for (; i + simd_width <= num_samples; i += simd_width)
{
const auto a = _mm256_load_ps(&src[i]);
const auto abs_a = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a);
peak_vec = _mm256_max_ps(peak_vec, abs_a);
}
auto low = _mm256_extractf128_ps(peak_vec, 0);
auto high = _mm256_extractf128_ps(peak_vec, 1);
auto max_lane = _mm_max_ps(low, high);
auto temp1 = _mm_shuffle_ps(max_lane, max_lane, _MM_SHUFFLE(2, 3, 0, 1));
auto max1 = _mm_max_ps(max_lane, temp1);
auto temp2 = _mm_shuffle_ps(max1, max1, _MM_SHUFFLE(1, 0, 3, 2));
auto final_max = _mm_max_ps(max1, temp2);
float peak = _mm_cvtss_f32(final_max);
for (; i < num_samples; ++i)
{
float abs_sample = std::fabs(src[i]);
if (abs_sample > peak)
{
peak = abs_sample;
}
}
return peak;
}
SIMD_EXPORT void normalize_audio(const float *src, float *dst, float target_peak, size_t num_samples)
{
ASSERT_ALIGNED(src, ALIGNMENT_AVX);
ASSERT_ALIGNED(dst, ALIGNMENT_AVX);
if (num_samples == 0 || target_peak <= 0.0f)
{
return;
}
const float current_peak = calculate_peak(src, num_samples);
if (current_peak < 1e-10f)
{
constexpr size_t simd_width = 8;
constexpr size_t unroll_factor = 4;
auto zero_vec = _mm256_setzero_ps();
size_t i = 0;
for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
{
_mm256_store_ps(&dst[i], zero_vec);
_mm256_store_ps(&dst[i + 8], zero_vec);
_mm256_store_ps(&dst[i + 16], zero_vec);
_mm256_store_ps(&dst[i + 24], zero_vec);
}
for (; i + simd_width <= num_samples; i += simd_width)
{
_mm256_store_ps(&dst[i], zero_vec);
}
for (; i < num_samples; ++i)
{
dst[i] = 0.0f;
}
return;
}
const float gain_factor = target_peak / current_peak;
apply_gain(src, dst, gain_factor, num_samples);
}
SIMD_EXPORT void stereo_to_mono(const float *stereo_src, float *mono_dst, size_t num_stereo_samples)
{
ASSERT_ALIGNED(stereo_src, ALIGNMENT_AVX);
ASSERT_ALIGNED(mono_dst, ALIGNMENT_AVX);
if (num_stereo_samples == 0)
{
return;
}
constexpr size_t simd_width = 8;
constexpr size_t unroll_factor = 4;
const auto half_vec = _mm256_set1_ps(0.5f);
size_t stereo_idx = 0;
size_t mono_idx = 0;
for (; stereo_idx + simd_width * 2 * unroll_factor <= num_stereo_samples * 2;
stereo_idx += simd_width * 2 * unroll_factor, mono_idx += simd_width * unroll_factor)
{
auto stereo0 = _mm256_load_ps(&stereo_src[stereo_idx]);
auto stereo1 = _mm256_load_ps(&stereo_src[stereo_idx + 8]);
auto stereo2 = _mm256_load_ps(&stereo_src[stereo_idx + 16]);
auto stereo3 = _mm256_load_ps(&stereo_src[stereo_idx + 24]);
auto stereo4 = _mm256_load_ps(&stereo_src[stereo_idx + 32]);
auto stereo5 = _mm256_load_ps(&stereo_src[stereo_idx + 40]);
auto stereo6 = _mm256_load_ps(&stereo_src[stereo_idx + 48]);
auto stereo7 = _mm256_load_ps(&stereo_src[stereo_idx + 56]);
auto left0 = _mm256_shuffle_ps(stereo0, stereo1, _MM_SHUFFLE(2, 0, 2, 0));
auto right0 = _mm256_shuffle_ps(stereo0, stereo1, _MM_SHUFFLE(3, 1, 3, 1));
auto left1 = _mm256_shuffle_ps(stereo2, stereo3, _MM_SHUFFLE(2, 0, 2, 0));
auto right1 = _mm256_shuffle_ps(stereo2, stereo3, _MM_SHUFFLE(3, 1, 3, 1));
auto left2 = _mm256_shuffle_ps(stereo4, stereo5, _MM_SHUFFLE(2, 0, 2, 0));
auto right2 = _mm256_shuffle_ps(stereo4, stereo5, _MM_SHUFFLE(3, 1, 3, 1));
auto left3 = _mm256_shuffle_ps(stereo6, stereo7, _MM_SHUFFLE(2, 0, 2, 0));
auto right3 = _mm256_shuffle_ps(stereo6, stereo7, _MM_SHUFFLE(3, 1, 3, 1));
left0 = _mm256_permute2f128_ps(left0, left0, 0x01);
right0 = _mm256_permute2f128_ps(right0, right0, 0x01);
left1 = _mm256_permute2f128_ps(left1, left1, 0x01);
right1 = _mm256_permute2f128_ps(right1, right1, 0x01);
left2 = _mm256_permute2f128_ps(left2, left2, 0x01);
right2 = _mm256_permute2f128_ps(right2, right2, 0x01);
left3 = _mm256_permute2f128_ps(left3, left3, 0x01);
right3 = _mm256_permute2f128_ps(right3, right3, 0x01);
auto mono0 = _mm256_mul_ps(_mm256_add_ps(left0, right0), half_vec);
auto mono1 = _mm256_mul_ps(_mm256_add_ps(left1, right1), half_vec);
auto mono2 = _mm256_mul_ps(_mm256_add_ps(left2, right2), half_vec);
auto mono3 = _mm256_mul_ps(_mm256_add_ps(left3, right3), half_vec);
_mm256_store_ps(&mono_dst[mono_idx], mono0);
_mm256_store_ps(&mono_dst[mono_idx + 8], mono1);
_mm256_store_ps(&mono_dst[mono_idx + 16], mono2);
_mm256_store_ps(&mono_dst[mono_idx + 24], mono3);
}
for (size_t i = stereo_idx / 2; i < num_stereo_samples; ++i)
{
const float left = stereo_src[i * 2];
const float right = stereo_src[i * 2 + 1];
mono_dst[i] = (left + right) * 0.5f;
}
}
SIMD_EXPORT void limit_audio(const float *src, float *dst, float threshold, float *limiter_state, float sample_rate,
size_t num_samples)
{
ASSERT_ALIGNED(src, ALIGNMENT_AVX);
ASSERT_ALIGNED(dst, ALIGNMENT_AVX);
if (num_samples == 0 || threshold <= 0.0f)
{
return;
}
constexpr size_t simd_width = 8;
constexpr size_t unroll_factor = 4;
constexpr float release_time = 0.05f;
float release_coeff = std::exp(-1.0f / (release_time * sample_rate));
float current_gain = limiter_state != nullptr ? *limiter_state : 1.0f;
size_t i = 0;
for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
{
auto a0 = _mm256_load_ps(&src[i]);
auto a1 = _mm256_load_ps(&src[i + 8]);
auto a2 = _mm256_load_ps(&src[i + 16]);
auto a3 = _mm256_load_ps(&src[i + 24]);
auto abs_a0 = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a0);
auto abs_a1 = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a1);
auto abs_a2 = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a2);
auto abs_a3 = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a3);
auto max_abs = _mm256_max_ps(_mm256_max_ps(abs_a0, abs_a1),
_mm256_max_ps(abs_a2, abs_a3));
auto high = _mm256_extractf128_ps(max_abs, 1);
auto low = _mm256_extractf128_ps(max_abs, 0);
auto max_lane = _mm_max_ps(high, low);
auto temp1 = _mm_shuffle_ps(max_lane, max_lane, _MM_SHUFFLE(2, 3, 0, 1));
auto max1 = _mm_max_ps(max_lane, temp1);
auto temp2 = _mm_shuffle_ps(max1, max1, _MM_SHUFFLE(1, 0, 3, 2));
auto final_max = _mm_max_ps(max1, temp2);
float max_sample = _mm_cvtss_f32(final_max);
float target_gain = max_sample > threshold ? threshold / max_sample : 1.0f;
if (target_gain < current_gain)
{
current_gain = target_gain;
}
else
{
current_gain = target_gain + (current_gain - target_gain) * release_coeff;
}
auto gain_vec = _mm256_set1_ps(current_gain);
auto result0 = _mm256_mul_ps(a0, gain_vec);
auto result1 = _mm256_mul_ps(a1, gain_vec);
auto result2 = _mm256_mul_ps(a2, gain_vec);
auto result3 = _mm256_mul_ps(a3, gain_vec);
_mm256_store_ps(&dst[i], result0);
_mm256_store_ps(&dst[i + 8], result1);
_mm256_store_ps(&dst[i + 16], result2);
_mm256_store_ps(&dst[i + 24], result3);
}
for (; i + simd_width <= num_samples; i += simd_width)
{
auto a = _mm256_load_ps(&src[i]);
auto abs_a = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a);
auto high = _mm256_extractf128_ps(abs_a, 1);
auto low = _mm256_extractf128_ps(abs_a, 0);
auto max_lane = _mm_max_ps(high, low);
auto temp1 = _mm_shuffle_ps(max_lane, max_lane, _MM_SHUFFLE(2, 3, 0, 1));
auto max1 = _mm_max_ps(max_lane, temp1);
auto temp2 = _mm_shuffle_ps(max1, max1, _MM_SHUFFLE(1, 0, 3, 2));
auto final_max = _mm_max_ps(max1, temp2);
float max_sample = _mm_cvtss_f32(final_max);
float target_gain = max_sample > threshold ? threshold / max_sample : 1.0f;
if (target_gain < current_gain)
{
current_gain = target_gain;
}
else
{
current_gain = target_gain + (current_gain - target_gain) * release_coeff;
}
auto gain_vec = _mm256_set1_ps(current_gain);
auto result = _mm256_mul_ps(a, gain_vec);
_mm256_store_ps(&dst[i], result);
}
for (; i < num_samples; ++i)
{
float sample = src[i];
float abs_sample = std::fabs(sample);
float target_gain = abs_sample > threshold ? threshold / abs_sample : 1.0f;
if (target_gain < current_gain)
{
current_gain = target_gain;
}
else
{
current_gain = target_gain + (current_gain - target_gain) * release_coeff;
}
dst[i] = sample * current_gain;
}
if (limiter_state != nullptr)
{
*limiter_state = current_gain;
}
}
SIMD_EXPORT void fade_audio(const float *src, float *dst, size_t fade_in_samples, size_t fade_out_samples,
size_t num_samples)
{
ASSERT_ALIGNED(src, ALIGNMENT_AVX);
ASSERT_ALIGNED(dst, ALIGNMENT_AVX);
if (num_samples == 0)
{
return;
}
constexpr size_t simd_width = 8;
constexpr size_t unroll_factor = 4;
size_t i = 0;
if (fade_in_samples > 0)
{
const float fade_in_step = 1.0f / static_cast<float>(fade_in_samples);
for (; i + simd_width * unroll_factor <= std::min(fade_in_samples, num_samples); i += simd_width *
unroll_factor)
{
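// _mm256_set_ps takes its arguments from the highest lane to the lowest, so each gain
// ramp is written as (i + 7) down to (i) to keep the ramp ascending in memory order.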
auto gain0 = _mm256_set_ps((i + 7) * fade_in_step, (i + 6) * fade_in_step, (i + 5) * fade_in_step,
(i + 4) * fade_in_step,
(i + 3) * fade_in_step, (i + 2) * fade_in_step, (i + 1) * fade_in_step,
i * fade_in_step);
auto gain1 = _mm256_set_ps((i + 15) * fade_in_step, (i + 14) * fade_in_step, (i + 13) * fade_in_step,
(i + 12) * fade_in_step,
(i + 11) * fade_in_step, (i + 10) * fade_in_step, (i + 9) * fade_in_step,
(i + 8) * fade_in_step);
auto gain2 = _mm256_set_ps((i + 23) * fade_in_step, (i + 22) * fade_in_step, (i + 21) * fade_in_step,
(i + 20) * fade_in_step,
(i + 19) * fade_in_step, (i + 18) * fade_in_step, (i + 17) * fade_in_step,
(i + 16) * fade_in_step);
auto gain3 = _mm256_set_ps((i + 31) * fade_in_step, (i + 30) * fade_in_step, (i + 29) * fade_in_step,
(i + 28) * fade_in_step,
(i + 27) * fade_in_step, (i + 26) * fade_in_step, (i + 25) * fade_in_step,
(i + 24) * fade_in_step);
auto a0 = _mm256_load_ps(&src[i]);
auto a1 = _mm256_load_ps(&src[i + 8]);
auto a2 = _mm256_load_ps(&src[i + 16]);
auto a3 = _mm256_load_ps(&src[i + 24]);
auto result0 = _mm256_mul_ps(a0, gain0);
auto result1 = _mm256_mul_ps(a1, gain1);
auto result2 = _mm256_mul_ps(a2, gain2);
auto result3 = _mm256_mul_ps(a3, gain3);
_mm256_store_ps(&dst[i], result0);
_mm256_store_ps(&dst[i + 8], result1);
_mm256_store_ps(&dst[i + 16], result2);
_mm256_store_ps(&dst[i + 24], result3);
}
for (; i < std::min(fade_in_samples, num_samples); ++i)
{
const float gain = static_cast<float>(i) / static_cast<float>(fade_in_samples);
dst[i] = src[i] * gain;
}
}
const size_t middle_start = fade_in_samples;
const size_t middle_end = num_samples > fade_out_samples ? num_samples - fade_out_samples : 0;
if (middle_end > middle_start)
{
for (size_t j = middle_start; j + simd_width * unroll_factor <= middle_end; j += simd_width *
unroll_factor)
{
auto a0 = _mm256_load_ps(&src[j]);
auto a1 = _mm256_load_ps(&src[j + 8]);
auto a2 = _mm256_load_ps(&src[j + 16]);
auto a3 = _mm256_load_ps(&src[j + 24]);
_mm256_store_ps(&dst[j], a0);
_mm256_store_ps(&dst[j + 8], a1);
_mm256_store_ps(&dst[j + 16], a2);
_mm256_store_ps(&dst[j + 24], a3);
}
for (size_t j = middle_start + ((middle_end - middle_start) / (simd_width * unroll_factor)) * (simd_width *
unroll_factor);
j < middle_end; ++j)
{
dst[j] = src[j];
}
}
if (fade_out_samples > 0 && num_samples > fade_out_samples)
{
const size_t fade_out_start = num_samples - fade_out_samples;
const float fade_out_step = 1.0f / static_cast<float>(fade_out_samples);
for (size_t j = fade_out_start; j + simd_width * unroll_factor <= num_samples; j += simd_width *
unroll_factor)
{
const size_t fade_out_offset = j - fade_out_start;
auto gain0 = _mm256_set_ps(1.0f - (fade_out_offset + 7) * fade_out_step,
1.0f - (fade_out_offset + 6) * fade_out_step,
1.0f - (fade_out_offset + 5) * fade_out_step,
1.0f - (fade_out_offset + 4) * fade_out_step,
1.0f - (fade_out_offset + 3) * fade_out_step,
1.0f - (fade_out_offset + 2) * fade_out_step,
1.0f - (fade_out_offset + 1) * fade_out_step,
1.0f - fade_out_offset * fade_out_step);
auto gain1 = _mm256_set_ps(1.0f - (fade_out_offset + 15) * fade_out_step,
1.0f - (fade_out_offset + 14) * fade_out_step,
1.0f - (fade_out_offset + 13) * fade_out_step,
1.0f - (fade_out_offset + 12) * fade_out_step,
1.0f - (fade_out_offset + 11) * fade_out_step,
1.0f - (fade_out_offset + 10) * fade_out_step,
1.0f - (fade_out_offset + 9) * fade_out_step,
1.0f - (fade_out_offset + 8) * fade_out_step);
auto gain2 = _mm256_set_ps(1.0f - (fade_out_offset + 23) * fade_out_step,
1.0f - (fade_out_offset + 22) * fade_out_step,
1.0f - (fade_out_offset + 21) * fade_out_step,
1.0f - (fade_out_offset + 20) * fade_out_step,
1.0f - (fade_out_offset + 19) * fade_out_step,
1.0f - (fade_out_offset + 18) * fade_out_step,
1.0f - (fade_out_offset + 17) * fade_out_step,
1.0f - (fade_out_offset + 16) * fade_out_step);
auto gain3 = _mm256_set_ps(1.0f - (fade_out_offset + 31) * fade_out_step,
1.0f - (fade_out_offset + 30) * fade_out_step,
1.0f - (fade_out_offset + 29) * fade_out_step,
1.0f - (fade_out_offset + 28) * fade_out_step,
1.0f - (fade_out_offset + 27) * fade_out_step,
1.0f - (fade_out_offset + 26) * fade_out_step,
1.0f - (fade_out_offset + 25) * fade_out_step,
1.0f - (fade_out_offset + 24) * fade_out_step);
auto a0 = _mm256_load_ps(&src[j]);
auto a1 = _mm256_load_ps(&src[j + 8]);
auto a2 = _mm256_load_ps(&src[j + 16]);
auto a3 = _mm256_load_ps(&src[j + 24]);
auto result0 = _mm256_mul_ps(a0, gain0);
auto result1 = _mm256_mul_ps(a1, gain1);
auto result2 = _mm256_mul_ps(a2, gain2);
auto result3 = _mm256_mul_ps(a3, gain3);
_mm256_store_ps(&dst[j], result0);
_mm256_store_ps(&dst[j + 8], result1);
_mm256_store_ps(&dst[j + 16], result2);
_mm256_store_ps(&dst[j + 24], result3);
}
for (size_t j = fade_out_start + ((fade_out_samples / (simd_width * unroll_factor)) * (simd_width *
unroll_factor));
j < num_samples; ++j)
{
const size_t fade_out_offset = j - fade_out_start;
const float gain = 1.0f - static_cast<float>(fade_out_offset) / static_cast<float>(fade_out_samples);
dst[j] = src[j] * gain;
}
}
}
SIMD_EXPORT void simple_eq(const float *src, float *dst, float low_gain, float mid_gain, float high_gain, float *eq_state,
size_t num_samples)
{
ASSERT_ALIGNED(src, ALIGNMENT_AVX);
ASSERT_ALIGNED(dst, ALIGNMENT_AVX);
if (num_samples == 0)
{
return;
}
constexpr size_t simd_width = 8;
constexpr size_t unroll_factor = 4;
constexpr float low_cutoff = 0.02f;
constexpr float high_cutoff = 0.1f;
constexpr float mid_factor = 0.7f;
float low_state = eq_state != nullptr ? *eq_state : 0.0f;
float high_state = eq_state != nullptr ? *(eq_state + 1) : 0.0f;
const auto low_gain_vec = _mm256_set1_ps(low_gain);
const auto mid_gain_vec = _mm256_set1_ps(mid_gain);
const auto high_gain_vec = _mm256_set1_ps(high_gain);
const auto low_cutoff_vec = _mm256_set1_ps(low_cutoff);
const auto high_cutoff_vec = _mm256_set1_ps(high_cutoff);
const auto mid_factor_vec = _mm256_set1_ps(mid_factor);
const auto one_minus_low_cutoff_vec = _mm256_set1_ps(1.0f - low_cutoff);
const auto one_minus_high_cutoff_vec = _mm256_set1_ps(1.0f - high_cutoff);
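// Note: the vector path broadcasts a single filter state to all 8 lanes and refreshes it
// from the last lane after each unrolled block, so it approximates the per-sample
// recursion used by the scalar tail below rather than matching it bit-exactly.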
size_t i = 0;
for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
{
auto input0 = _mm256_load_ps(&src[i]);
auto input1 = _mm256_load_ps(&src[i + 8]);
auto input2 = _mm256_load_ps(&src[i + 16]);
auto input3 = _mm256_load_ps(&src[i + 24]);
auto low_state_vec = _mm256_set1_ps(low_state);
auto low0 = _mm256_add_ps(_mm256_mul_ps(input0, low_cutoff_vec),
_mm256_mul_ps(low_state_vec, one_minus_low_cutoff_vec));
auto low1 = _mm256_add_ps(_mm256_mul_ps(input1, low_cutoff_vec),
_mm256_mul_ps(low0, one_minus_low_cutoff_vec));
auto low2 = _mm256_add_ps(_mm256_mul_ps(input2, low_cutoff_vec),
_mm256_mul_ps(low1, one_minus_low_cutoff_vec));
auto low3 = _mm256_add_ps(_mm256_mul_ps(input3, low_cutoff_vec),
_mm256_mul_ps(low2, one_minus_low_cutoff_vec));
auto high0 = _mm256_sub_ps(input0, low0);
auto high1 = _mm256_sub_ps(input1, low1);
auto high2 = _mm256_sub_ps(input2, low2);
auto high3 = _mm256_sub_ps(input3, low3);
auto high_state_vec = _mm256_set1_ps(high_state);
high0 = _mm256_add_ps(_mm256_mul_ps(high0, high_cutoff_vec),
_mm256_mul_ps(high_state_vec, one_minus_high_cutoff_vec));
high1 = _mm256_add_ps(_mm256_mul_ps(high1, high_cutoff_vec),
_mm256_mul_ps(high0, one_minus_high_cutoff_vec));
high2 = _mm256_add_ps(_mm256_mul_ps(high2, high_cutoff_vec),
_mm256_mul_ps(high1, one_minus_high_cutoff_vec));
high3 = _mm256_add_ps(_mm256_mul_ps(high3, high_cutoff_vec),
_mm256_mul_ps(high2, one_minus_high_cutoff_vec));
auto mid0 = _mm256_mul_ps(_mm256_sub_ps(_mm256_sub_ps(input0, low0), high0), mid_factor_vec);
auto mid1 = _mm256_mul_ps(_mm256_sub_ps(_mm256_sub_ps(input1, low1), high1), mid_factor_vec);
auto mid2 = _mm256_mul_ps(_mm256_sub_ps(_mm256_sub_ps(input2, low2), high2), mid_factor_vec);
auto mid3 = _mm256_mul_ps(_mm256_sub_ps(_mm256_sub_ps(input3, low3), high3), mid_factor_vec);
auto result0 = _mm256_add_ps(
_mm256_add_ps(_mm256_mul_ps(low0, low_gain_vec), _mm256_mul_ps(mid0, mid_gain_vec)),
_mm256_mul_ps(high0, high_gain_vec));
auto result1 = _mm256_add_ps(
_mm256_add_ps(_mm256_mul_ps(low1, low_gain_vec), _mm256_mul_ps(mid1, mid_gain_vec)),
_mm256_mul_ps(high1, high_gain_vec));
auto result2 = _mm256_add_ps(
_mm256_add_ps(_mm256_mul_ps(low2, low_gain_vec), _mm256_mul_ps(mid2, mid_gain_vec)),
_mm256_mul_ps(high2, high_gain_vec));
auto result3 = _mm256_add_ps(
_mm256_add_ps(_mm256_mul_ps(low3, low_gain_vec), _mm256_mul_ps(mid3, mid_gain_vec)),
_mm256_mul_ps(high3, high_gain_vec));
_mm256_store_ps(&dst[i], result0);
_mm256_store_ps(&dst[i + 8], result1);
_mm256_store_ps(&dst[i + 16], result2);
_mm256_store_ps(&dst[i + 24], result3);
auto low_temp = _mm256_extractf128_ps(low3, 1);
low_state = _mm_cvtss_f32(_mm_shuffle_ps(low_temp, low_temp, _MM_SHUFFLE(3, 3, 3, 3)));
auto high_temp = _mm256_extractf128_ps(high3, 1);
high_state = _mm_cvtss_f32(_mm_shuffle_ps(high_temp, high_temp, _MM_SHUFFLE(3, 3, 3, 3)));
}
for (; i < num_samples; ++i)
{
float input = src[i];
float low_output = low_cutoff * input + (1.0f - low_cutoff) * low_state;
low_state = low_output;
float high_input = input - low_output;
float high_output = high_cutoff * high_input + (1.0f - high_cutoff) * high_state;
high_state = high_output;
float mid_output = (input - low_output - high_output) * mid_factor;
dst[i] = low_output * low_gain + mid_output * mid_gain + high_output * high_gain;
}
if (eq_state != nullptr)
{
*eq_state = low_state;
*(eq_state + 1) = high_state;
}
}
}
#endif

View File

@@ -0,0 +1,7 @@
project(alicho_simd_avx512)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64")
simple_library(SHARED)
target_compile_options(${PROJECT_NAME} PRIVATE -mavx512f -mavx512bw -mavx512vl -mavx512dq)
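# Note: these are GCC/Clang-style flags; an MSVC build would likely need /arch:AVX512 instead.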
target_link_libraries(${PROJECT_NAME} PUBLIC alicho_simd_interface)
endif()

View File

@@ -0,0 +1,756 @@
/**
* @file x86_avx512_audio_processing_func.cpp
* @brief x86 AVX-512音频处理函数实现
*/
#include "simd_interface.h"
#include <cmath>
#include <immintrin.h>
#include "aligned_allocator.h"
extern "C"
{
SIMD_EXPORT void fill_buffer(float *buffer, float value, size_t num_samples)
{
ASSERT_ALIGNED(buffer, ALIGNMENT_AVX512);
constexpr size_t simd_width = 16;
constexpr size_t unroll_factor = 4;
size_t i = 0;
auto value_vec = _mm512_set1_ps(value);
for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
{
_mm512_store_ps(&buffer[i], value_vec);
_mm512_store_ps(&buffer[i + 16], value_vec);
_mm512_store_ps(&buffer[i + 32], value_vec);
_mm512_store_ps(&buffer[i + 48], value_vec);
}
for (; i + simd_width <= num_samples; i += simd_width)
{
_mm512_store_ps(&buffer[i], value_vec);
}
for (; i < num_samples; ++i)
{
buffer[i] = value;
}
}
SIMD_EXPORT void mix_audio(const float *src1, const float *src2, float *dst, size_t num_samples)
{
ASSERT_ALIGNED(src1, ALIGNMENT_AVX512);
ASSERT_ALIGNED(src2, ALIGNMENT_AVX512);
ASSERT_ALIGNED(dst, ALIGNMENT_AVX512);
constexpr size_t simd_width = 16;
constexpr size_t unroll_factor = 4;
size_t i = 0;
for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
{
const auto a0 = _mm512_load_ps(&src1[i]);
const auto a1 = _mm512_load_ps(&src1[i + 16]);
const auto a2 = _mm512_load_ps(&src1[i + 32]);
const auto a3 = _mm512_load_ps(&src1[i + 48]);
const auto b0 = _mm512_load_ps(&src2[i]);
const auto b1 = _mm512_load_ps(&src2[i + 16]);
const auto b2 = _mm512_load_ps(&src2[i + 32]);
const auto b3 = _mm512_load_ps(&src2[i + 48]);
const auto result0 = _mm512_add_ps(a0, b0);
const auto result1 = _mm512_add_ps(a1, b1);
const auto result2 = _mm512_add_ps(a2, b2);
const auto result3 = _mm512_add_ps(a3, b3);
_mm512_store_ps(&dst[i], result0);
_mm512_store_ps(&dst[i + 16], result1);
_mm512_store_ps(&dst[i + 32], result2);
_mm512_store_ps(&dst[i + 48], result3);
}
for (; i + simd_width <= num_samples; i += simd_width)
{
auto a = _mm512_load_ps(&src1[i]);
auto b = _mm512_load_ps(&src2[i]);
auto result = _mm512_add_ps(a, b);
_mm512_store_ps(&dst[i], result);
}
for (; i < num_samples; ++i)
{
dst[i] = src1[i] + src2[i];
}
}
SIMD_EXPORT void apply_gain(const float *src, float *dst, float gain, size_t num_samples)
{
ASSERT_ALIGNED(src, ALIGNMENT_AVX512);
ASSERT_ALIGNED(dst, ALIGNMENT_AVX512);
constexpr size_t simd_width = 16;
constexpr size_t unroll_factor = 4;
size_t i = 0;
auto gain_vec = _mm512_set1_ps(gain);
for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
{
auto a0 = _mm512_load_ps(&src[i]);
auto a1 = _mm512_load_ps(&src[i + 16]);
auto a2 = _mm512_load_ps(&src[i + 32]);
auto a3 = _mm512_load_ps(&src[i + 48]);
auto result0 = _mm512_mul_ps(a0, gain_vec);
auto result1 = _mm512_mul_ps(a1, gain_vec);
auto result2 = _mm512_mul_ps(a2, gain_vec);
auto result3 = _mm512_mul_ps(a3, gain_vec);
_mm512_store_ps(&dst[i], result0);
_mm512_store_ps(&dst[i + 16], result1);
_mm512_store_ps(&dst[i + 32], result2);
_mm512_store_ps(&dst[i + 48], result3);
}
for (; i + simd_width <= num_samples; i += simd_width)
{
auto a = _mm512_load_ps(&src[i]);
auto result = _mm512_mul_ps(a, gain_vec);
_mm512_store_ps(&dst[i], result);
}
for (; i < num_samples; ++i)
{
dst[i] = src[i] * gain;
}
}
SIMD_EXPORT float calculate_rms(const float *src, size_t num_samples)
{
ASSERT_ALIGNED(src, ALIGNMENT_AVX512);
// Guard against division by zero for empty buffers.
if (num_samples == 0)
{
return 0.0f;
}
constexpr size_t simd_width = 16;
constexpr size_t unroll_factor = 4;
size_t i = 0;
auto sum_squares0 = _mm512_setzero_ps();
auto sum_squares1 = _mm512_setzero_ps();
auto sum_squares2 = _mm512_setzero_ps();
auto sum_squares3 = _mm512_setzero_ps();
for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
{
const auto a0 = _mm512_load_ps(&src[i]);
const auto a1 = _mm512_load_ps(&src[i + 16]);
const auto a2 = _mm512_load_ps(&src[i + 32]);
const auto a3 = _mm512_load_ps(&src[i + 48]);
const auto squared0 = _mm512_mul_ps(a0, a0);
const auto squared1 = _mm512_mul_ps(a1, a1);
const auto squared2 = _mm512_mul_ps(a2, a2);
const auto squared3 = _mm512_mul_ps(a3, a3);
sum_squares0 = _mm512_add_ps(sum_squares0, squared0);
sum_squares1 = _mm512_add_ps(sum_squares1, squared1);
sum_squares2 = _mm512_add_ps(sum_squares2, squared2);
sum_squares3 = _mm512_add_ps(sum_squares3, squared3);
}
auto sum_squares = _mm512_add_ps(_mm512_add_ps(sum_squares0, sum_squares1),
_mm512_add_ps(sum_squares2, sum_squares3));
for (; i + simd_width <= num_samples; i += simd_width)
{
const auto a = _mm512_load_ps(&src[i]);
const auto squared = _mm512_mul_ps(a, a);
sum_squares = _mm512_add_ps(sum_squares, squared);
}
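// _mm512_reduce_add_ps returns a float; the remainder is accumulated in double to limit rounding error.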
double total_sum = _mm512_reduce_add_ps(sum_squares);
for (; i < num_samples; ++i)
{
total_sum += static_cast<double>(src[i]) * static_cast<double>(src[i]);
}
return static_cast<float>(std::sqrt(total_sum / static_cast<double>(num_samples)));
}
SIMD_EXPORT float calculate_peak(const float *src, size_t num_samples)
{
ASSERT_ALIGNED(src, ALIGNMENT_AVX512);
constexpr size_t simd_width = 16;
constexpr size_t unroll_factor = 4;
size_t i = 0;
auto peak_vec0 = _mm512_setzero_ps();
auto peak_vec1 = _mm512_setzero_ps();
auto peak_vec2 = _mm512_setzero_ps();
auto peak_vec3 = _mm512_setzero_ps();
for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
{
const auto a0 = _mm512_load_ps(&src[i]);
const auto a1 = _mm512_load_ps(&src[i + 16]);
const auto a2 = _mm512_load_ps(&src[i + 32]);
const auto a3 = _mm512_load_ps(&src[i + 48]);
const auto abs_a0 = _mm512_andnot_ps(_mm512_set1_ps(-0.0f), a0);
const auto abs_a1 = _mm512_andnot_ps(_mm512_set1_ps(-0.0f), a1);
const auto abs_a2 = _mm512_andnot_ps(_mm512_set1_ps(-0.0f), a2);
const auto abs_a3 = _mm512_andnot_ps(_mm512_set1_ps(-0.0f), a3);
peak_vec0 = _mm512_max_ps(peak_vec0, abs_a0);
peak_vec1 = _mm512_max_ps(peak_vec1, abs_a1);
peak_vec2 = _mm512_max_ps(peak_vec2, abs_a2);
peak_vec3 = _mm512_max_ps(peak_vec3, abs_a3);
}
auto peak_vec = _mm512_max_ps(_mm512_max_ps(peak_vec0, peak_vec1),
_mm512_max_ps(peak_vec2, peak_vec3));
for (; i + simd_width <= num_samples; i += simd_width)
{
const auto a = _mm512_load_ps(&src[i]);
const auto abs_a = _mm512_andnot_ps(_mm512_set1_ps(-0.0f), a);
peak_vec = _mm512_max_ps(peak_vec, abs_a);
}
float peak = _mm512_reduce_max_ps(peak_vec);
for (; i < num_samples; ++i)
{
float abs_sample = std::fabs(src[i]);
if (abs_sample > peak)
{
peak = abs_sample;
}
}
return peak;
}
SIMD_EXPORT void normalize_audio(const float *src, float *dst, float target_peak, size_t num_samples)
{
ASSERT_ALIGNED(src, ALIGNMENT_AVX512);
ASSERT_ALIGNED(dst, ALIGNMENT_AVX512);
if (num_samples == 0 || target_peak <= 0.0f)
{
return;
}
const float current_peak = calculate_peak(src, num_samples);
if (current_peak < 1e-10f)
{
constexpr size_t simd_width = 16;
constexpr size_t unroll_factor = 4;
auto zero_vec = _mm512_setzero_ps();
size_t i = 0;
for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
{
_mm512_store_ps(&dst[i], zero_vec);
_mm512_store_ps(&dst[i + 16], zero_vec);
_mm512_store_ps(&dst[i + 32], zero_vec);
_mm512_store_ps(&dst[i + 48], zero_vec);
}
for (; i + simd_width <= num_samples; i += simd_width)
{
_mm512_store_ps(&dst[i], zero_vec);
}
for (; i < num_samples; ++i)
{
dst[i] = 0.0f;
}
return;
}
const float gain_factor = target_peak / current_peak;
apply_gain(src, dst, gain_factor, num_samples);
}
SIMD_EXPORT void stereo_to_mono(const float *stereo_src, float *mono_dst, size_t num_stereo_samples)
{
ASSERT_ALIGNED(stereo_src, ALIGNMENT_AVX512);
ASSERT_ALIGNED(mono_dst, ALIGNMENT_AVX512);
if (num_stereo_samples == 0)
{
return;
}
constexpr size_t simd_width = 16;
constexpr size_t unroll_factor = 4;
const auto half_vec = _mm512_set1_ps(0.5f);
size_t stereo_idx = 0;
size_t mono_idx = 0;
for (; stereo_idx + simd_width * 2 * unroll_factor <= num_stereo_samples * 2;
stereo_idx += simd_width * 2 * unroll_factor, mono_idx += simd_width * unroll_factor)
{
auto stereo0 = _mm512_load_ps(&stereo_src[stereo_idx]);
auto stereo1 = _mm512_load_ps(&stereo_src[stereo_idx + 16]);
auto stereo2 = _mm512_load_ps(&stereo_src[stereo_idx + 32]);
auto stereo3 = _mm512_load_ps(&stereo_src[stereo_idx + 48]);
auto stereo4 = _mm512_load_ps(&stereo_src[stereo_idx + 64]);
auto stereo5 = _mm512_load_ps(&stereo_src[stereo_idx + 80]);
auto stereo6 = _mm512_load_ps(&stereo_src[stereo_idx + 96]);
auto stereo7 = _mm512_load_ps(&stereo_src[stereo_idx + 112]);
const auto even_mask = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
const auto odd_mask = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
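// _mm512_permutex2var_ps indexes into the 32-float concatenation of its two inputs:
// the even-index mask gathers the left (even-position) samples and the odd-index mask
// gathers the right (odd-position) samples from each pair of interleaved vectors.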
auto left0 = _mm512_permutex2var_ps(stereo0, even_mask, stereo1);
auto right0 = _mm512_permutex2var_ps(stereo0, odd_mask, stereo1);
auto left1 = _mm512_permutex2var_ps(stereo2, even_mask, stereo3);
auto right1 = _mm512_permutex2var_ps(stereo2, odd_mask, stereo3);
auto left2 = _mm512_permutex2var_ps(stereo4, even_mask, stereo5);
auto right2 = _mm512_permutex2var_ps(stereo4, odd_mask, stereo5);
auto left3 = _mm512_permutex2var_ps(stereo6, even_mask, stereo7);
auto right3 = _mm512_permutex2var_ps(stereo6, odd_mask, stereo7);
auto mono0 = _mm512_mul_ps(_mm512_add_ps(left0, right0), half_vec);
auto mono1 = _mm512_mul_ps(_mm512_add_ps(left1, right1), half_vec);
auto mono2 = _mm512_mul_ps(_mm512_add_ps(left2, right2), half_vec);
auto mono3 = _mm512_mul_ps(_mm512_add_ps(left3, right3), half_vec);
_mm512_store_ps(&mono_dst[mono_idx], mono0);
_mm512_store_ps(&mono_dst[mono_idx + 16], mono1);
_mm512_store_ps(&mono_dst[mono_idx + 32], mono2);
_mm512_store_ps(&mono_dst[mono_idx + 48], mono3);
}
for (size_t i = stereo_idx / 2; i < num_stereo_samples; ++i)
{
const float left = stereo_src[i * 2];
const float right = stereo_src[i * 2 + 1];
mono_dst[i] = (left + right) * 0.5f;
}
}
SIMD_EXPORT void limit_audio(const float *src, float *dst, float threshold, float *limiter_state, float sample_rate,
size_t num_samples)
{
ASSERT_ALIGNED(src, ALIGNMENT_AVX512);
ASSERT_ALIGNED(dst, ALIGNMENT_AVX512);
if (num_samples == 0 || threshold <= 0.0f)
{
return;
}
constexpr size_t simd_width = 16;
constexpr size_t unroll_factor = 4;
constexpr float release_time = 0.05f;
float release_coeff = std::exp(-1.0f / (release_time * sample_rate));
float current_gain = limiter_state != nullptr ? *limiter_state : 1.0f;
size_t i = 0;
for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
{
auto a0 = _mm512_load_ps(&src[i]);
auto a1 = _mm512_load_ps(&src[i + 16]);
auto a2 = _mm512_load_ps(&src[i + 32]);
auto a3 = _mm512_load_ps(&src[i + 48]);
auto abs_a0 = _mm512_abs_ps(a0);
auto abs_a1 = _mm512_abs_ps(a1);
auto abs_a2 = _mm512_abs_ps(a2);
auto abs_a3 = _mm512_abs_ps(a3);
auto max_abs = _mm512_max_ps(_mm512_max_ps(abs_a0, abs_a1),
_mm512_max_ps(abs_a2, abs_a3));
float max_sample = _mm512_reduce_max_ps(max_abs);
float target_gain = max_sample > threshold ? threshold / max_sample : 1.0f;
if (target_gain < current_gain)
{
current_gain = target_gain;
}
else
{
current_gain = target_gain + (current_gain - target_gain) * release_coeff;
}
auto gain_vec = _mm512_set1_ps(current_gain);
auto result0 = _mm512_mul_ps(a0, gain_vec);
auto result1 = _mm512_mul_ps(a1, gain_vec);
auto result2 = _mm512_mul_ps(a2, gain_vec);
auto result3 = _mm512_mul_ps(a3, gain_vec);
_mm512_store_ps(&dst[i], result0);
_mm512_store_ps(&dst[i + 16], result1);
_mm512_store_ps(&dst[i + 32], result2);
_mm512_store_ps(&dst[i + 48], result3);
}
for (; i + simd_width <= num_samples; i += simd_width)
{
auto a = _mm512_load_ps(&src[i]);
auto abs_a = _mm512_abs_ps(a);
float max_sample = _mm512_reduce_max_ps(abs_a);
float target_gain = max_sample > threshold ? threshold / max_sample : 1.0f;
if (target_gain < current_gain)
{
current_gain = target_gain;
}
else
{
current_gain = target_gain + (current_gain - target_gain) * release_coeff;
}
auto gain_vec = _mm512_set1_ps(current_gain);
auto result = _mm512_mul_ps(a, gain_vec);
_mm512_store_ps(&dst[i], result);
}
for (; i < num_samples; ++i)
{
float sample = src[i];
float abs_sample = std::fabs(sample);
float target_gain = abs_sample > threshold ? threshold / abs_sample : 1.0f;
if (target_gain < current_gain)
{
current_gain = target_gain;
}
else
{
current_gain = target_gain + (current_gain - target_gain) * release_coeff;
}
dst[i] = sample * current_gain;
}
if (limiter_state != nullptr)
{
*limiter_state = current_gain;
}
}
SIMD_EXPORT void fade_audio(const float *src, float *dst, size_t fade_in_samples, size_t fade_out_samples,
size_t num_samples)
{
ASSERT_ALIGNED(src, ALIGNMENT_AVX512);
ASSERT_ALIGNED(dst, ALIGNMENT_AVX512);
if (num_samples == 0)
{
return;
}
constexpr size_t simd_width = 16;
constexpr size_t unroll_factor = 4;
size_t i = 0;
if (fade_in_samples > 0)
{
const float fade_in_step = 1.0f / static_cast<float>(fade_in_samples);
for (; i + simd_width * unroll_factor <= std::min(fade_in_samples, num_samples); i += simd_width *
unroll_factor)
{
auto gain0 = _mm512_set_ps((i + 15) * fade_in_step, (i + 14) * fade_in_step, (i + 13) * fade_in_step,
(i + 12) * fade_in_step,
(i + 11) * fade_in_step, (i + 10) * fade_in_step, (i + 9) * fade_in_step,
(i + 8) * fade_in_step,
(i + 7) * fade_in_step, (i + 6) * fade_in_step, (i + 5) * fade_in_step,
(i + 4) * fade_in_step,
(i + 3) * fade_in_step, (i + 2) * fade_in_step, (i + 1) * fade_in_step,
i * fade_in_step);
auto gain1 = _mm512_set_ps((i + 31) * fade_in_step, (i + 30) * fade_in_step, (i + 29) * fade_in_step,
(i + 28) * fade_in_step,
(i + 27) * fade_in_step, (i + 26) * fade_in_step, (i + 25) * fade_in_step,
(i + 24) * fade_in_step,
(i + 23) * fade_in_step, (i + 22) * fade_in_step, (i + 21) * fade_in_step,
(i + 20) * fade_in_step,
(i + 19) * fade_in_step, (i + 18) * fade_in_step, (i + 17) * fade_in_step,
(i + 16) * fade_in_step);
auto gain2 = _mm512_set_ps((i + 47) * fade_in_step, (i + 46) * fade_in_step, (i + 45) * fade_in_step,
(i + 44) * fade_in_step,
(i + 43) * fade_in_step, (i + 42) * fade_in_step, (i + 41) * fade_in_step,
(i + 40) * fade_in_step,
(i + 39) * fade_in_step, (i + 38) * fade_in_step, (i + 37) * fade_in_step,
(i + 36) * fade_in_step,
(i + 35) * fade_in_step, (i + 34) * fade_in_step, (i + 33) * fade_in_step,
(i + 32) * fade_in_step);
auto gain3 = _mm512_set_ps((i + 63) * fade_in_step, (i + 62) * fade_in_step, (i + 61) * fade_in_step,
(i + 60) * fade_in_step,
(i + 59) * fade_in_step, (i + 58) * fade_in_step, (i + 57) * fade_in_step,
(i + 56) * fade_in_step,
(i + 55) * fade_in_step, (i + 54) * fade_in_step, (i + 53) * fade_in_step,
(i + 52) * fade_in_step,
(i + 51) * fade_in_step, (i + 50) * fade_in_step, (i + 49) * fade_in_step,
(i + 48) * fade_in_step);
auto a0 = _mm512_load_ps(&src[i]);
auto a1 = _mm512_load_ps(&src[i + 16]);
auto a2 = _mm512_load_ps(&src[i + 32]);
auto a3 = _mm512_load_ps(&src[i + 48]);
auto result0 = _mm512_mul_ps(a0, gain0);
auto result1 = _mm512_mul_ps(a1, gain1);
auto result2 = _mm512_mul_ps(a2, gain2);
auto result3 = _mm512_mul_ps(a3, gain3);
_mm512_store_ps(&dst[i], result0);
_mm512_store_ps(&dst[i + 16], result1);
_mm512_store_ps(&dst[i + 32], result2);
_mm512_store_ps(&dst[i + 48], result3);
}
for (; i < std::min(fade_in_samples, num_samples); ++i)
{
const float gain = static_cast<float>(i) / static_cast<float>(fade_in_samples);
dst[i] = src[i] * gain;
}
}
const size_t middle_start = fade_in_samples;
const size_t middle_end = num_samples > fade_out_samples ? num_samples - fade_out_samples : 0;
if (middle_end > middle_start)
{
for (size_t j = middle_start; j + simd_width * unroll_factor <= middle_end; j += simd_width *
unroll_factor)
{
auto a0 = _mm512_load_ps(&src[j]);
auto a1 = _mm512_load_ps(&src[j + 16]);
auto a2 = _mm512_load_ps(&src[j + 32]);
auto a3 = _mm512_load_ps(&src[j + 48]);
_mm512_store_ps(&dst[j], a0);
_mm512_store_ps(&dst[j + 16], a1);
_mm512_store_ps(&dst[j + 32], a2);
_mm512_store_ps(&dst[j + 48], a3);
}
for (size_t j = middle_start + ((middle_end - middle_start) / (simd_width * unroll_factor)) * (simd_width *
unroll_factor);
j < middle_end; ++j)
{
dst[j] = src[j];
}
}
if (fade_out_samples > 0 && num_samples > fade_out_samples)
{
const size_t fade_out_start = num_samples - fade_out_samples;
const float fade_out_step = 1.0f / static_cast<float>(fade_out_samples);
for (size_t j = fade_out_start; j + simd_width * unroll_factor <= num_samples; j += simd_width *
unroll_factor)
{
const size_t fade_out_offset = j - fade_out_start;
auto gain0 = _mm512_set_ps(1.0f - (fade_out_offset + 15) * fade_out_step,
1.0f - (fade_out_offset + 14) * fade_out_step,
1.0f - (fade_out_offset + 13) * fade_out_step,
1.0f - (fade_out_offset + 12) * fade_out_step,
1.0f - (fade_out_offset + 11) * fade_out_step,
1.0f - (fade_out_offset + 10) * fade_out_step,
1.0f - (fade_out_offset + 9) * fade_out_step,
1.0f - (fade_out_offset + 8) * fade_out_step,
1.0f - (fade_out_offset + 7) * fade_out_step,
1.0f - (fade_out_offset + 6) * fade_out_step,
1.0f - (fade_out_offset + 5) * fade_out_step,
1.0f - (fade_out_offset + 4) * fade_out_step,
1.0f - (fade_out_offset + 3) * fade_out_step,
1.0f - (fade_out_offset + 2) * fade_out_step,
1.0f - (fade_out_offset + 1) * fade_out_step,
1.0f - fade_out_offset * fade_out_step);
auto gain1 = _mm512_set_ps(1.0f - (fade_out_offset + 31) * fade_out_step,
1.0f - (fade_out_offset + 30) * fade_out_step,
1.0f - (fade_out_offset + 29) * fade_out_step,
1.0f - (fade_out_offset + 28) * fade_out_step,
1.0f - (fade_out_offset + 27) * fade_out_step,
1.0f - (fade_out_offset + 26) * fade_out_step,
1.0f - (fade_out_offset + 25) * fade_out_step,
1.0f - (fade_out_offset + 24) * fade_out_step,
1.0f - (fade_out_offset + 23) * fade_out_step,
1.0f - (fade_out_offset + 22) * fade_out_step,
1.0f - (fade_out_offset + 21) * fade_out_step,
1.0f - (fade_out_offset + 20) * fade_out_step,
1.0f - (fade_out_offset + 19) * fade_out_step,
1.0f - (fade_out_offset + 18) * fade_out_step,
1.0f - (fade_out_offset + 17) * fade_out_step,
1.0f - (fade_out_offset + 16) * fade_out_step);
auto gain2 = _mm512_set_ps(1.0f - (fade_out_offset + 47) * fade_out_step,
1.0f - (fade_out_offset + 46) * fade_out_step,
1.0f - (fade_out_offset + 45) * fade_out_step,
1.0f - (fade_out_offset + 44) * fade_out_step,
1.0f - (fade_out_offset + 43) * fade_out_step,
1.0f - (fade_out_offset + 42) * fade_out_step,
1.0f - (fade_out_offset + 41) * fade_out_step,
1.0f - (fade_out_offset + 40) * fade_out_step,
1.0f - (fade_out_offset + 39) * fade_out_step,
1.0f - (fade_out_offset + 38) * fade_out_step,
1.0f - (fade_out_offset + 37) * fade_out_step,
1.0f - (fade_out_offset + 36) * fade_out_step,
1.0f - (fade_out_offset + 35) * fade_out_step,
1.0f - (fade_out_offset + 34) * fade_out_step,
1.0f - (fade_out_offset + 33) * fade_out_step,
1.0f - (fade_out_offset + 32) * fade_out_step);
auto gain3 = _mm512_set_ps(1.0f - (fade_out_offset + 63) * fade_out_step,
1.0f - (fade_out_offset + 62) * fade_out_step,
1.0f - (fade_out_offset + 61) * fade_out_step,
1.0f - (fade_out_offset + 60) * fade_out_step,
1.0f - (fade_out_offset + 59) * fade_out_step,
1.0f - (fade_out_offset + 58) * fade_out_step,
1.0f - (fade_out_offset + 57) * fade_out_step,
1.0f - (fade_out_offset + 56) * fade_out_step,
1.0f - (fade_out_offset + 55) * fade_out_step,
1.0f - (fade_out_offset + 54) * fade_out_step,
1.0f - (fade_out_offset + 53) * fade_out_step,
1.0f - (fade_out_offset + 52) * fade_out_step,
1.0f - (fade_out_offset + 51) * fade_out_step,
1.0f - (fade_out_offset + 50) * fade_out_step,
1.0f - (fade_out_offset + 49) * fade_out_step,
1.0f - (fade_out_offset + 48) * fade_out_step);
auto a0 = _mm512_load_ps(&src[j]);
auto a1 = _mm512_load_ps(&src[j + 16]);
auto a2 = _mm512_load_ps(&src[j + 32]);
auto a3 = _mm512_load_ps(&src[j + 48]);
auto result0 = _mm512_mul_ps(a0, gain0);
auto result1 = _mm512_mul_ps(a1, gain1);
auto result2 = _mm512_mul_ps(a2, gain2);
auto result3 = _mm512_mul_ps(a3, gain3);
_mm512_store_ps(&dst[j], result0);
_mm512_store_ps(&dst[j + 16], result1);
_mm512_store_ps(&dst[j + 32], result2);
_mm512_store_ps(&dst[j + 48], result3);
}
for (size_t j = fade_out_start + ((fade_out_samples / (simd_width * unroll_factor)) * (simd_width *
unroll_factor));
j < num_samples; ++j)
{
const size_t fade_out_offset = j - fade_out_start;
const float gain = 1.0f - static_cast<float>(fade_out_offset) / static_cast<float>(fade_out_samples);
dst[j] = src[j] * gain;
}
}
}
SIMD_EXPORT void simple_eq(const float *src, float *dst, float low_gain, float mid_gain, float high_gain,
float *eq_state, size_t num_samples)
{
ASSERT_ALIGNED(src, ALIGNMENT_AVX512);
ASSERT_ALIGNED(dst, ALIGNMENT_AVX512);
if (num_samples == 0)
{
return;
}
constexpr size_t simd_width = 16;
constexpr size_t unroll_factor = 4;
constexpr float low_cutoff = 0.02f;
constexpr float high_cutoff = 0.1f;
constexpr float mid_factor = 0.7f;
float low_state = eq_state != nullptr ? *eq_state : 0.0f;
float high_state = eq_state != nullptr ? *(eq_state + 1) : 0.0f;
const auto low_gain_vec = _mm512_set1_ps(low_gain);
const auto mid_gain_vec = _mm512_set1_ps(mid_gain);
const auto high_gain_vec = _mm512_set1_ps(high_gain);
const auto low_cutoff_vec = _mm512_set1_ps(low_cutoff);
const auto high_cutoff_vec = _mm512_set1_ps(high_cutoff);
const auto mid_factor_vec = _mm512_set1_ps(mid_factor);
const auto one_minus_low_cutoff_vec = _mm512_set1_ps(1.0f - low_cutoff);
const auto one_minus_high_cutoff_vec = _mm512_set1_ps(1.0f - high_cutoff);
size_t i = 0;
for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
{
auto input0 = _mm512_load_ps(&src[i]);
auto input1 = _mm512_load_ps(&src[i + 16]);
auto input2 = _mm512_load_ps(&src[i + 32]);
auto input3 = _mm512_load_ps(&src[i + 48]);
auto low_state_vec = _mm512_set1_ps(low_state);
auto low0 = _mm512_fmadd_ps(input0, low_cutoff_vec, _mm512_mul_ps(low_state_vec, one_minus_low_cutoff_vec));
auto low1 = _mm512_fmadd_ps(input1, low_cutoff_vec, _mm512_mul_ps(low0, one_minus_low_cutoff_vec));
auto low2 = _mm512_fmadd_ps(input2, low_cutoff_vec, _mm512_mul_ps(low1, one_minus_low_cutoff_vec));
auto low3 = _mm512_fmadd_ps(input3, low_cutoff_vec, _mm512_mul_ps(low2, one_minus_low_cutoff_vec));
auto high0 = _mm512_sub_ps(input0, low0);
auto high1 = _mm512_sub_ps(input1, low1);
auto high2 = _mm512_sub_ps(input2, low2);
auto high3 = _mm512_sub_ps(input3, low3);
auto high_state_vec = _mm512_set1_ps(high_state);
high0 = _mm512_fmadd_ps(high0, high_cutoff_vec, _mm512_mul_ps(high_state_vec, one_minus_high_cutoff_vec));
high1 = _mm512_fmadd_ps(high1, high_cutoff_vec, _mm512_mul_ps(high0, one_minus_high_cutoff_vec));
high2 = _mm512_fmadd_ps(high2, high_cutoff_vec, _mm512_mul_ps(high1, one_minus_high_cutoff_vec));
high3 = _mm512_fmadd_ps(high3, high_cutoff_vec, _mm512_mul_ps(high2, one_minus_high_cutoff_vec));
auto mid0 = _mm512_mul_ps(_mm512_sub_ps(_mm512_sub_ps(input0, low0), high0), mid_factor_vec);
auto mid1 = _mm512_mul_ps(_mm512_sub_ps(_mm512_sub_ps(input1, low1), high1), mid_factor_vec);
auto mid2 = _mm512_mul_ps(_mm512_sub_ps(_mm512_sub_ps(input2, low2), high2), mid_factor_vec);
auto mid3 = _mm512_mul_ps(_mm512_sub_ps(_mm512_sub_ps(input3, low3), high3), mid_factor_vec);
auto result0 = _mm512_fmadd_ps(low0, low_gain_vec,
_mm512_fmadd_ps(mid0, mid_gain_vec, _mm512_mul_ps(high0, high_gain_vec)));
auto result1 = _mm512_fmadd_ps(low1, low_gain_vec,
_mm512_fmadd_ps(mid1, mid_gain_vec, _mm512_mul_ps(high1, high_gain_vec)));
auto result2 = _mm512_fmadd_ps(low2, low_gain_vec,
_mm512_fmadd_ps(mid2, mid_gain_vec, _mm512_mul_ps(high2, high_gain_vec)));
auto result3 = _mm512_fmadd_ps(low3, low_gain_vec,
_mm512_fmadd_ps(mid3, mid_gain_vec, _mm512_mul_ps(high3, high_gain_vec)));
_mm512_store_ps(&dst[i], result0);
_mm512_store_ps(&dst[i + 16], result1);
_mm512_store_ps(&dst[i + 32], result2);
_mm512_store_ps(&dst[i + 48], result3);
// Carry the state of the final lane (lane 15) into the next block, matching the scalar tail below.
__m128 low_temp = _mm512_extractf32x4_ps(low3, 3);
low_state = _mm_cvtss_f32(_mm_shuffle_ps(low_temp, low_temp, _MM_SHUFFLE(3, 3, 3, 3)));
__m128 high_temp = _mm512_extractf32x4_ps(high3, 3);
high_state = _mm_cvtss_f32(_mm_shuffle_ps(high_temp, high_temp, _MM_SHUFFLE(3, 3, 3, 3)));
}
for (; i < num_samples; ++i)
{
float input = src[i];
float low_output = low_cutoff * input + (1.0f - low_cutoff) * low_state;
low_state = low_output;
float high_input = input - low_output;
float high_output = high_cutoff * high_input + (1.0f - high_cutoff) * high_state;
high_state = high_output;
float mid_output = (input - low_output - high_output) * mid_factor;
dst[i] = low_output * low_gain + mid_output * mid_gain + high_output * high_gain;
}
if (eq_state != nullptr)
{
*eq_state = low_state;
*(eq_state + 1) = high_state;
}
}
}

View File

@@ -1,428 +0,0 @@
/**
* @file simd_func_dispatcher.cpp
* @brief SIMD函数调度器实现 - 运行时函数分发的核心实现
*
* 本文件实现了SIMD函数调度器的核心功能,包括:
* - 函数注册表状态查询和调试输出
* - 版本枚举与字符串之间的双向转换
* - 运行时函数版本选择的辅助功能
*
* 核心实现功能:
* ============================================================================
* 1. print_registry_status() - 调试诊断功能
* - 遍历并输出所有已注册的函数及其可用版本
* - 用于运行时验证函数注册是否正确
* - 帮助开发者了解当前可用的SIMD优化函数
*
* 2. simd_func_version_to_string() - 版本到字符串转换
* - 将枚举值转换为可读的字符串表示
* - 用于日志输出、调试信息和用户界面显示
* - 采用switch-case实现确保编译时类型安全
*
* 3. string_to_simd_func_version() - 字符串到版本转换
* - 将字符串解析为版本枚举值
* - 用于配置文件解析、命令行参数处理
* - 提供回退机制无法识别时返回SCALAR版本
*
* 实现特点:
* ============================================================================
* - 简洁明了:实现直观,易于维护和扩展
* - 类型安全:使用强类型枚举,避免魔法数字
* - 完整映射覆盖所有定义的SIMD版本
* - 健壮性:处理未知版本的边界情况
* - 零依赖:仅依赖标准库和项目头文件
*
* 性能考虑:
* ============================================================================
* - 版本转换函数使用简单的条件判断,性能开销可忽略
* - print_registry_status() 仅用于调试,不在性能关键路径上
* - 字符串比较采用高效的std::string相等性判断
*
* 设计模式:
* ============================================================================
* - 该文件是simd_func_dispatcher类的实现部分
* - 采用了接口与实现分离的设计
* - 头文件定义接口和模板实现,cpp文件实现非模板函数
*
* @note 这些函数主要用于调试、日志和配置解析,不在性能关键路径上
* @see simd_func_dispatcher.h 查看完整的类定义和接口说明
*/
#include "simd_func_dispatcher.h"
#include <cstdio>
/**
* @brief 打印函数注册表状态 - 调试和诊断工具
*
* 该函数遍历并打印所有已注册到调度器的SIMD函数及其可用版本
* 是一个重要的调试和诊断工具。
*
* 功能详述:
* ============================================================================
* 1. 遍历函数注册表
* - 访问func_registry_中每个函数名和持有者对
* - 使用基于范围的for循环提高代码可读性
*
* 2. 获取版本信息
* - 调用holder的has_implementation()检查是否有实现
* - 如有实现调用get_available_versions()获取所有已注册版本
*
* 3. 格式化输出
* - 函数名:清晰标识当前函数
* - 版本列表展示所有可用的SIMD优化版本
* - 未实现提示:明确标识未注册的函数
*
* 输出格式示例:
* ============================================================================
* ```
* Registered SIMD Functions:
* Function: process_audio
* Available Versions: SCALAR SSE4 AVX2
* Function: mix_channels
* Available Versions: SCALAR NEON
* Function: apply_gain
* No implementations registered.
* ```
*
* 使用场景:
* ============================================================================
* - 程序启动时验证函数注册是否正确
* - 调试时检查特定函数的可用版本
* - 性能分析时了解当前使用的优化级别
* - 单元测试中验证注册逻辑
*
* 性能考虑:
* ============================================================================
* - 该函数仅用于调试,不应在性能关键代码中频繁调用
* - 使用printf而非std::cout以减少头文件依赖和提高输出性能
* - 遍历整个注册表的时间复杂度为O(n*m),n为函数数量,m为平均版本数
*
* @note 这是一个const成员函数,不会修改调度器状态
* @note 输出直接发送到stdout适合命令行程序使用
*/
void simd_func_dispatcher::print_registry_status() const {
// 打印标题,标识输出内容
printf("Registered SIMD Functions:\n");
// 遍历函数注册表中的所有条目
// pair.first: 函数名称std::string
// pair.second: 函数持有者的unique_ptrfunc_holder_base*
for (const auto& pair : func_registry_) {
const auto& func_name = pair.first; // 获取函数名称
const auto& holder = pair.second; // 获取函数持有者
// 输出当前函数名称
printf("Function: %s\n", func_name.c_str());
// 检查该函数是否有任何版本的实现
if (holder->has_implementation()) {
// 获取所有已注册的版本列表
auto versions = holder->get_available_versions();
// 输出版本列表的标题
printf(" Available Versions: ");
// 遍历并输出每个可用版本
// 使用switch-case将枚举值转换为可读的字符串
for (const auto& version : versions) {
switch (version) {
// x86/x64架构的SIMD版本
case simd_func_version::SCALAR:
printf("SCALAR "); // 标量版本,无SIMD优化
break;
case simd_func_version::SSE:
printf("SSE "); // SSE/SSE2版本,128位向量
break;
case simd_func_version::SSE3:
printf("SSE3 "); // SSE3/SSSE3版本
break;
case simd_func_version::SSE4:
printf("SSE4 "); // SSE4.1/SSE4.2版本
break;
case simd_func_version::AVX:
printf("AVX "); // AVX版本256位向量
break;
case simd_func_version::AVX2:
printf("AVX2 "); // AVX2+FMA版本
break;
case simd_func_version::AVX512:
printf("AVX512 "); // AVX-512版本512位量
break;
// ARM架构的SIMD版本
case simd_func_version::NEON:
printf("NEON "); // ARM NEON版本
break;
case simd_func_version::NEON_FP16:
printf("NEON_FP16 "); // NEON半精度浮点版
break;
// RISC-V架构的向量扩展
case simd_func_version::VECTOR:
printf("VECTOR "); // RISC-V向量扩展
break;
// 处理未知版本(理论上不应出现)
default:
break;
}
}
// 版本列表输出完毕,换行
printf("\n");
}
else {
// 该函数尚未注册任何实现版本
printf(" No implementations registered.\n");
}
}
}
/**
* @brief SIMD版本枚举转字符串 - 将版本枚举值转换为可读字符串
* @param version SIMD函数版本枚举
* @return 对应的版本名称字符串C风格字符串
*
* 该函数提供版本枚举到字符串的标准转换,用于:
* - 日志输出和调试信息
* - 用户界面显示
* - 配置文件生成
* - 错误消息构造
*
* 实现策略:
* ============================================================================
* 1. 使用switch-case实现完全映射
* - 编译器可以检测遗漏的case分支
* - 保证类型安全,避免隐式转换
* - 性能优秀,通常编译为跳转表
*
* 2. 返回C字符串而非std::string
* - 避免内存分配开销
* - 字符串字面量存储在只读数据段
* - 生命周期为整个程序运行期
*
* 3. 提供默认处理
* - 对于未识别的枚举值返回"UNKNOWN"
* - 提高代码健壮性,防止未定义行为
*
* 映射关系:
* ============================================================================
* | 枚举值 | 返回字符串 | 说明 |
* |-------------------|--------------|------------------------|
* | SCALAR | "SCALAR" | 标量实现 |
* | SSE | "SSE" | SSE/SSE2指令集 |
* | SSE3 | "SSE3" | SSE3/SSSE3指令集 |
* | SSE4 | "SSE4" | SSE4.1/4.2指令集 |
* | AVX | "AVX" | AVX指令集 |
* | AVX2 | "AVX2" | AVX2+FMA指令集 |
* | AVX512 | "AVX512" | AVX-512指令集 |
* | NEON | "NEON" | ARM NEON指令集 |
* | NEON_FP16 | "NEON_FP16" | NEON半精度浮点 |
* | VECTOR | "VECTOR" | RISC-V向量扩展 |
* | 其他 | "UNKNOWN" | 未知或非法值 |
*
* 使用示例:
* ============================================================================
* @code
* // 日志输出
* const char* name = simd_func_version_to_string(simd_func_version::AVX2);
* logger->info("Using SIMD version: {}", name); // 输出: Using SIMD version: AVX2
*
* // 调试信息
* printf("Current version: %s\n", simd_func_version_to_string(current_version));
*
* // 配置文件生成
* config_file << "preferred_version=" << simd_func_version_to_string(preferred) << "\n";
* @endcode
*
* 性能特性:
* ============================================================================
* - 时间复杂度O(1) - 编译器通常优化为跳转表或二分查找
* - 空间复杂度O(1) - 字符串字面量存储在只读数据段
* - 无内存分配:返回静态字符串,无运行时开销
* - 线程安全:只读操作,无共享状态修改
*
* @note 返回的字符串为静态存储,调用者不应修改或释放
* @note constexpr修饰符未使用是因为字符串字面量返回类型的限制
* @see string_to_simd_func_version() 执行反向转换
*/
const char* simd_func_version_to_string(simd_func_version version) {
// 使用switch-case实现完全映射
// 编译器会检测是否遗漏case分支(如果使用-Wswitch警告)
switch (version) {
// 标量版本 - 基础实现,所有平台都支持
case simd_func_version::SCALAR:
return "SCALAR";
// x86/x64 SIMD指令集版本发展顺序
case simd_func_version::SSE:
return "SSE"; // 2003年Pentium 4引入128位向量
case simd_func_version::SSE3:
return "SSE3"; // 2006年Core微架构增强浮点运算
case simd_func_version::SSE4:
return "SSE4"; // 2008年Nehalem微架增强整数和字符串处理
case simd_func_version::AVX:
return "AVX"; // 2011年Sandy Bridge256位向量
case simd_func_version::AVX2:
return "AVX2"; // 2013年Haswell完256位整数运算+FMA
case simd_func_version::AVX512:
return "AVX512"; // 2016年Xeon Phi/Skylake-X512位量
// ARM SIMD指令集版本
case simd_func_version::NEON:
return "NEON"; // ARMv8-A标准128位量
case simd_func_version::NEON_FP16:
return "NEON_FP16"; // ARMv8.2-A硬件半精度浮点支持
// RISC-V向量扩展
case simd_func_version::VECTOR:
return "VECTOR"; // RISC-V V扩展可变长度量
// 默认情况:处理未知或非法的枚举值
// 这提供了一层安全保障,虽然理论上不应到达这里
default:
break;
}
// 返回未知版本标识
// 这种情况可能发生在:
// 1. 枚举值被错误地强制转换
// 2. 内存损坏导致枚举值异常
// 3. 跨版本兼容性问题
return "UNKNOWN";
}
/**
* @brief 字符串转SIMD版本枚举 - 将字符串解析为版本枚举值
* @param version_str 版本名称字符串(如"AVX2"、"NEON"等)
* @return 对应的SIMD函数版本枚举值
*
* 该函数将字符串表示的版本名称转换为枚举值,用于:
* - 配置文件解析读取用户指定的SIMD版本偏好
* - 命令行参数处理(--simd-version=AVX2
* - 环境变量解析SIMD_VERSION=SSE4
* - 测试和调试(手动指定测试版本)
*
* 实现策略:
* ============================================================================
* 1. 线性字符串比较
* - 按从高到低的版本顺序检查
* - 使用std::string的相等比较运算符
* - 大小写敏感匹配
*
* 2. 回退到SCALAR版本
* - 无法识别的字符串返回最安全的SCALAR版本
* - 保证函数永远返回有效值
* - 避免程序因无效输入而崩溃
*
* 3. 优化潜力
* - 当前实现简单但足够高效(此函数不在热路径上)
* - 可能的优化哈希表查找、Trie树、完美哈希
* - 未优化原因:此函数主要在启动时调用,性能影响可忽略
*
* 支持的字符串:
* ============================================================================
* | 输入字符串 | 返回枚举值 | 备注 |
* |------------------|------------------------|---------------------|
* | "SCALAR" | simd_func_version::SCALAR | 标量实现 |
* | "SSE" | simd_func_version::SSE | SSE/SSE2 |
* | "SSE3" | simd_func_version::SSE3 | SSE3/SSSE3 |
* | "SSE4" | simd_func_version::SSE4 | SSE4.1/4.2 |
* | "AVX" | simd_func_version::AVX | AVX指令集 |
* | "AVX2" | simd_func_version::AVX2 | AVX2+FMA |
* | "AVX512" | simd_func_version::AVX512 | AVX-512 |
* | "NEON" | simd_func_version::NEON | ARM NEON |
* | "NEON_FP16" | simd_func_version::NEON_FP16 | NEON半精度 |
* | "VECTOR" | simd_func_version::VECTOR | RISC-V向量 |
* | 其他任何字符串 | simd_func_version::SCALAR | 默认回退 |
*
* 使用示例:
* ============================================================================
* @code
* // 配置文件解析
* std::string config_version = config["simd_version"];
* auto version = string_to_simd_func_version(config_version);
*
* // 命令行参数处理
* if (argc > 1) {
* auto preferred = string_to_simd_func_version(argv[1]);
* // 使用preferred版本...
* }
*
* // 环境变量解析
* const char* env_version = std::getenv("SIMD_VERSION");
* if (env_version) {
* auto version = string_to_simd_func_version(env_version);
* }
*
* // 测试中指定版本
* auto test_version = string_to_simd_func_version("AVX2");
* test_with_version(test_version);
* @endcode
*
* 设计考虑:
* ============================================================================
* 1. 大小写敏感
* - 当前实现要求精确匹配("AVX2"而非"avx2"
* - 原因:保持一致性,避免混淆
* - 改进:可添加大小写不敏感版本或预处理转换
*
* 2. 错误处理策略
* - 采用"宽容"策略无效输入返回SCALAR而非抛异常
* - 优点:避免程序崩溃,提供最低限度的功能
* - 缺点:可能掩盖配置错误
* - 建议:调用者应验证输入或记录回退日志
*
* 3. 性能特性
* - 最坏情况O(n) - n为版本数量约10次字符串比较
* - 平均情况:取决于输入分布
* - 优化可能哈希表O(1),但增加复杂度
* - 当前选择:简单性优先(此函数不在性能关键路径)
*
* @note 字符串比较是大小写敏感的
* @note 无法识别的字符串会回退到SCALAR版本不会抛出异常
* @note 建议在调用后验证返回值是否符合预期
* @see simd_func_version_to_string() 执行反向转换
*/
simd_func_version string_to_simd_func_version(const std::string& version_str) {
// 按版本由高到低的顺序进行检查
// 这样的顺序有助于代码可读性,但对性能无实质影响
// 标量版本检查
if (version_str == "SCALAR")
return simd_func_version::SCALAR;
// x86/x64 SIMD版本检查(按指令集发展顺序)
if (version_str == "SSE")
return simd_func_version::SSE;
if (version_str == "SSE3")
return simd_func_version::SSE3;
if (version_str == "SSE4")
return simd_func_version::SSE4;
if (version_str == "AVX")
return simd_func_version::AVX;
if (version_str == "AVX2")
return simd_func_version::AVX2;
if (version_str == "AVX512")
return simd_func_version::AVX512;
// ARM SIMD版本检查
if (version_str == "NEON")
return simd_func_version::NEON;
if (version_str == "NEON_FP16")
return simd_func_version::NEON_FP16;
// RISC-V向量扩展检查
if (version_str == "VECTOR")
return simd_func_version::VECTOR;
// 无法识别的字符串回退到SCALAR版本
// 这提供了最基本的兼容性保证
// 可能的原因:
// - 拼写错误:"avx2"(小写)、"AVX 2"(有空格)
// - 不支持的版本名称:"AVX10"、"SSE5"
// - 空字符串或格式错误的输入
//
// 注意:这里静默回退可能掩盖配置错误
// 建议:调用者在关键场景应添加日志记录
return simd_func_version::SCALAR; // 默认返回SCALAR
}

View File

@@ -1,593 +0,0 @@
/**
* @file simd_func_dispatcher.h
* @brief SIMD函数调度器 - 运行时选择最优SIMD实现的核心框架
*
* 本模块提供了一个强大而灵活的SIMD函数调度系统能够
* - 在运行时根据CPU特性自动选择最优的SIMD实现
* - 支持多版本函数标量、SSE、AVX、NEON等的统一管理
* - 提供类型安全的函数注册和调用机制
* - 自动回退到兼容性更好的实现版本
*
* 核心设计思想:
* 1. 多版本实现每个函数可以有多个针对不同SIMD级别的优化版本
* 2. 运行时选择程序启动时检测CPU特性选择最佳版本
* 3. 透明调用:用户调用时无需关心具体使用哪个版本
* 4. 类型安全使用模板和std::function确保类型匹配
*
* 工作流程:
* ```
* [注册阶段]
* 1. 为每个函数注册多个SIMD版本的实现
* 2. 调度器存储所有版本并根据CPU能力选择最优版本
*
* [调用阶段]
* 3. 用户调用函数时,调度器自动使用预选的最优版本
* 4. 如果最优版本不可用,自动回退到次优版本
* ```
*
* 使用示例:
* @code
* // 注册函数的多个版本
* REGISTER_SIMD_FUNCTION("process_audio", simd_func_version::SCALAR, scalar_impl);
* REGISTER_SIMD_FUNCTION("process_audio", simd_func_version::AVX2, avx2_impl);
*
* // 获取并调用最优版本
* auto& func = GET_SIMD_FUNCTION(void(float*, size_t), "process_audio");
* func(data, count); // 自动使用AVX2版本如果CPU支持
* @endcode
*
* @note 这是整个SIMD优化框架的核心组件
* @see cpu_feature_detector, multi_version_func
*/
#pragma once
#include <algorithm>
#include <array>
#include <functional>
#include <memory>
#include <stdexcept>
#include <unordered_map>
#include "cpu_features.h"
/**
* @enum simd_func_version
* @brief SIMD函数版本枚举 - 定义所有可能的函数实现版本
*
* 该枚举定义了函数可以有的所有SIMD优化版本。
* 每个版本对应一个特定的SIMD指令集级别。
*
* 版本排序:
* - 枚举值从低到高表示性能从弱到强
* - SCALAR是最基础的版本所有CPU都支持
* - COUNT用于数组大小不是实际版本
*
* 与simd_level的关系
* - simd_level表示CPU的能力级别
* - simd_func_version表示函数的实现版本
* - 通过simd_level_to_version()进行转换
*
* @note 不是所有函数都需要实现所有版本
* @see simd_level, simd_level_to_version()
*/
enum class simd_func_version {
/** 标量实现 - 纯C++代码无SIMD优化
* - 兼容性所有CPU
* - 性能基准性能1x
* - 用途:最低保底实现、参考实现
* - 必要性:强制要求,作为回退版本
*/
SCALAR = 0,
/** SSE实现 - 使用SSE/SSE2指令
* - 兼容性2003年后的所有x86/x64
* - 向量宽度128位
* - 性能提升约2-4倍
*/
SSE,
/** SSE3实现 - 使用SSE3/SSSE3指令
* - 兼容性2006年后的主流CPU
* - 新增功能:水平运算、复数支持
* - 性能提升比SSE快10-20%
*/
SSE3,
/** SSE4实现 - 使用SSE4.1/SSE4.2指令
* - 兼容性2008年后的主流CPU
* - 新增功能点积、blend、字符串处理
* - 性能提升比SSE3快15-30%
*/
SSE4,
/** AVX实现 - 使用AVX指令
* - 兼容性2011年后的主流CPU
* - 向量宽度256位
* - 性能提升约2倍SSE4性能
*/
AVX,
/** AVX2实现 - 使用AVX2 + FMA指令
* - 兼容性2013年后的主流CPU
* - 新增功能完整256位整数运算、FMA
* - 性能提升比AVX快50-100%
* - 推荐:当前最佳性能/兼容性平衡点
*/
AVX2,
/** AVX-512实现 - 使用AVX-512指令集
* - 兼容性2016年后的高端CPU
* - 向量宽度512位
* - 性能提升约2倍AVX2性能理论
* - 注意可能导致CPU降频
*/
AVX512,
/** NEON实现 - 使用ARM NEON指令
* - 兼容性所有ARMv8-A (64位ARM)
* - 向量宽度128位
* - 性能与SSE4相当
* - 应用移动设备、Apple Silicon
*/
NEON,
/** NEON + FP16实现 - 使用NEON半精度浮点
* - 兼容性ARMv8.2-A及更新
* - 新增硬件FP16运算
* - 性能FP16运算快2倍
* - 应用移动端AI推理
*/
NEON_FP16,
/** RISC-V向量扩展实现
* - 兼容性支持RVV的RISC-V处理器
* - 特点:可变向量长度
* - 应用嵌入式、IoT
*/
VECTOR,
/** 版本数量标记
* 用于数组大小定义,不是实际的函数版本
*/
COUNT
};
/**
* @brief 将SIMD级别转换为函数版本
* @param level CPU的SIMD级别
* @return 对应的函数版本枚举值
*
* 将cpu_feature_detector检测到的SIMD级别转换为
* 函数调度器使用的版本标识。
*
* 映射关系:
* - simd_level::NONE -> simd_func_version::SCALAR
* - simd_level::SSE -> simd_func_version::SSE
* - simd_level::AVX2 -> simd_func_version::AVX2
* - 等等...
*
* @note constexpr函数编译时求值零运行时开销
* @see simd_level, simd_func_version
*/
constexpr auto simd_level_to_version(simd_level level) {
switch (level) {
case simd_level::NONE:
return simd_func_version::SCALAR;
case simd_level::SSE:
return simd_func_version::SSE;
case simd_level::SSE3:
return simd_func_version::SSE3;
case simd_level::SSE4:
return simd_func_version::SSE4;
case simd_level::AVX:
return simd_func_version::AVX;
case simd_level::AVX2:
return simd_func_version::AVX2;
case simd_level::AVX512:
return simd_func_version::AVX512;
case simd_level::NEON:
return simd_func_version::NEON;
case simd_level::NEON_FP16:
return simd_func_version::NEON_FP16;
}
// 默认回退到标量版本
return simd_func_version::SCALAR;
}
// 前向声明
template <typename func_signature>
class multi_version_func;
/**
* @class multi_version_func
* @brief 多版本函数容器 - 管理同一函数的多个SIMD优化版本
* @tparam return_type 函数返回类型
* @tparam args 函数参数类型列表
*
* 该类模板存储和管理一个函数的所有SIMD版本实现
* 并能根据CPU能力自动选择最优版本。
*
* 核心功能:
* 1. 存储多个版本为每个SIMD级别存储一个函数实现
* 2. 自动选择根据CPU能力选择最优可用版本
* 3. 智能回退:如果最优版本不可用,自动使用次优版本
* 4. 类型安全使用std::function确保所有版本签名一致
*
* 选择策略:
* - 获取推荐的SIMD级别考虑性能和兼容性
* - 从推荐级别开始,向下查找第一个可用的实现
* - 如果都不可用,抛出异常
*
* 使用示例:
* @code
* multi_version_func<void(float*, size_t)> func;
* func.register_version(simd_func_version::SCALAR, scalar_impl);
* func.register_version(simd_func_version::AVX2, avx2_impl);
*
* // 自动选择最优版本并调用
* func(data, count);
* @endcode
*
* @note 通常不直接使用而是通过simd_func_dispatcher管理
* @see simd_func_dispatcher
*/
template <typename return_type, typename... args>
class multi_version_func<return_type(args...)> {
public:
/// 函数类型定义
using func_type = std::function<return_type(args...)>;
/// 函数数组类型(存储所有版本)
using func_arr = std::array<func_type, static_cast<size_t>(simd_func_version::COUNT)>;
/**
* @brief 默认构造函数
*
* 创建空的多版本函数容器所有版本槽位初始化为nullptr
*/
multi_version_func() = default;
/**
* @brief 注册函数的特定版本实现
* @param version SIMD版本标识
* @param func 该版本的函数实现
*
* 将一个函数实现注册到指定的SIMD版本槽位。
* 如果该槽位已有实现,会被覆盖。
* 注册后会自动更新最佳函数选择。
*
* @note 使用std::move避免不必要的拷贝
*/
void register_version(simd_func_version version, func_type func) {
functions_[static_cast<size_t>(version)] = std::move(func);
best_func_ = get_best_func(); // 更新最佳函数
}
/**
* @brief 获取当前最佳的函数实现
* @return 最优函数的引用
*
* 根据以下策略选择最佳函数:
* 1. 获取推荐的SIMD级别考虑CPU特性和性能
* 2. 转换为函数版本枚举
* 3. 从推荐版本开始向下查找第一个可用的实现
* 4. 如果都不可用,返回空函数指针
*
* 回退顺序示例假设推荐AVX2
* AVX2 -> AVX -> SSE4 -> SSE3 -> SSE -> SCALAR
*
* @note 该函数会被缓存到best_func_成员避免重复查找
*/
const auto& get_best_func() const {
const auto recommended_level = get_recommended_simd_level();
const auto referred_version = simd_level_to_version(recommended_level);
// 从首选版本开始,向下查找可用的实现
for (int v = static_cast<int>(referred_version); v >= 0; --v) {
auto version = static_cast<simd_func_version>(v);
if (const auto& func = functions_[static_cast<size_t>(version)]) { return func; }
}
// 如果没有找到任何实现,返回一个空函数
static const func_type empty_func = nullptr;
return empty_func;
}
/**
* @brief 函数调用运算符 - 执行最优版本的函数
* @param in_args 转发给函数的参数
* @return 函数执行结果
* @throws std::runtime_error 如果没有可用的实现
*
* 自动选择并调用最佳版本的函数实现。
* 参数会被完美转发到实际的函数。
*
* @note 这使得multi_version_func对象可以像普通函数一样调用
*/
auto operator()(args... in_args) const {
if (!best_func_) {
throw std::runtime_error("没有可用的SIMD实现。");
}
return best_func_(std::forward<args>(in_args)...);
}
/**
* @brief 检查是否有任何版本的实现
* @return true表示至少有一个版本已注册
*
* 用于验证函数是否已正确注册。
*/
auto has_implementation() const {
return std::any_of(functions_.begin(), functions_.end(), [](const auto& func) { return func != nullptr; });
}
/**
* @brief 获取所有已注册版本的列表
* @return 已注册的版本枚举值向量
*
* 用于调试和状态查询,列出该函数有哪些版本的实现。
*
* 示例输出:[SCALAR, SSE, AVX2]
*/
auto get_available_versions() const {
std::vector<simd_func_version> available_versions;
for (size_t i = 0; i < functions_.size(); ++i) {
if (functions_[i]) { available_versions.push_back(static_cast<simd_func_version>(i)); }
}
return available_versions;
}
private:
func_arr functions_{}; ///< 所有版本的函数数组
func_type best_func_{nullptr}; ///< 缓存的最佳函数(性能优化)
};
/**
* @class simd_func_dispatcher
* @brief SIMD函数调度器 - 全局函数注册和调度中心
*
* 这是整个SIMD优化框架的核心类采用单例模式管理所有SIMD优化函数。
*
* 主要职责:
* 1. 函数注册:接受多版本函数的注册
* 2. 函数存储:使用类型擦除技术统一管理不同签名的函数
* 3. 函数查询:根据名称和签名获取最优版本的函数
* 4. 函数调用:提供便捷的调用接口
* 5. 状态查询:列出所有已注册的函数及其版本
*
* 设计特点:
* - 单例模式:全局唯一实例,集中管理
* - 类型安全:模板确保函数签名匹配
* - 类型擦除:不同签名的函数可以存储在同一容器中
* - 延迟绑定运行时根据CPU特性选择最优版本
*
* 典型工作流程:
* ```
* [初始化阶段]
* 1. 程序启动时各模块注册自己的SIMD函数
* REGISTER_SIMD_FUNCTION("mix_audio", SCALAR, scalar_mix);
* REGISTER_SIMD_FUNCTION("mix_audio", AVX2, avx2_mix);
*
* [运行阶段]
* 2. 代码中获取并调用函数
* auto& mix = GET_SIMD_FUNCTION(void(float*, float*, float*, size_t), "mix_audio");
* mix(src1, src2, dst, count); // 自动使用AVX2版本
* ```
*
* @note 通常通过宏REGISTER_SIMD_FUNCTION、GET_SIMD_FUNCTION使用
* @see multi_version_func, lazy_singleton
*/
class simd_func_dispatcher : public lazy_singleton<simd_func_dispatcher> {
public:
friend class lazy_singleton<simd_func_dispatcher>;
/**
* @brief 注册函数的特定版本实现
* @tparam func_signature 函数签名类型如void(float*, size_t)
* @param func_name 函数名称(字符串标识)
* @param version SIMD版本标识
* @param func 该版本的函数实现
*
* 将一个函数的特定SIMD版本注册到调度器。
* 如果该函数名第一次出现,会自动创建多版本函数容器。
* 如果该版本已存在,会被新实现覆盖。
*
* @note 推荐使用REGISTER_SIMD_FUNCTION宏而不是直接调用
*/
template <typename func_signature>
void register_function(const std::string& func_name,
simd_func_version version,
std::function<func_signature> func) {
auto& holder = get_or_create_func<func_signature>(func_name);
holder.register_version(version, std::move(func));
}
/**
* @brief 获取函数的最优版本
* @tparam func_signature 函数签名类型
* @param func_name 函数名称
* @return 多版本函数对象的引用
* @throws std::runtime_error 如果函数未注册
*
* @note 推荐使用GET_SIMD_FUNCTION宏
*/
template <typename func_signature>
const auto& get_function(const std::string& func_name) const {
const auto& it = func_registry_.find(func_name);
if (it == func_registry_.end()) {
throw std::runtime_error("函数 '" + func_name + "' 未注册");
}
auto* holder = static_cast<func_holder<func_signature>*>(it->second.get());
return holder->func;
}
/**
* @brief 直接调用函数(便捷接口)
* @tparam func_signature 函数签名类型
* @tparam args 参数类型包
* @param func_name 函数名称
* @param in_args 转发给函数的参数
* @return 函数执行结果
* @throws std::runtime_error 如果函数未注册或无可用实现
*
* @note 推荐使用CALL_SIMD_FUNCTION宏
*/
template <typename func_signature, typename... args>
auto call_function(const std::string& func_name, args&&... in_args) const {
const auto& func = get_function<func_signature>(func_name);
return func(std::forward<args>(in_args)...);
}
/**
* @brief 列出所有已注册的函数名称
* @return 函数名称列表
*/
[[nodiscard]] auto list_functions() const -> std::vector<std::string> {
std::vector<std::string> func_names;
for (const auto& pair : func_registry_) { func_names.push_back(pair.first); }
return func_names;
}
/**
* @brief 打印所有函数的注册状态
* @see simd_func_dispatcher.cpp 实现在cpp文件中
*/
void print_registry_status() const;
private:
/** 函数持有者基类 - 类型擦除的基础 */
struct func_holder_base {
virtual ~func_holder_base() = default;
[[nodiscard]] virtual auto get_available_versions() const -> std::vector<simd_func_version> = 0;
[[nodiscard]] virtual auto has_implementation() const -> bool = 0;
};
/** 具体的函数持有者模板 */
template <typename func_signature>
struct func_holder : func_holder_base {
multi_version_func<func_signature> func;
[[nodiscard]] auto get_available_versions() const -> std::vector<simd_func_version> override {
return func.get_available_versions();
}
[[nodiscard]] auto has_implementation() const -> bool override { return func.has_implementation(); }
};
/** 获取或创建函数持有者(内部辅助函数) */
template <typename func_signature>
auto& get_or_create_func(const std::string& func_name) {
const auto& it = func_registry_.find(func_name);
if (it != func_registry_.end()) {
auto* holder = static_cast<func_holder<func_signature>*>(it->second.get());
return holder->func;
}
auto holder = std::make_unique<func_holder<func_signature>>();
auto* ptr = holder.get();
func_registry_[func_name] = std::move(holder);
return ptr->func;
}
/// 函数注册表:函数名 -> 函数持有者的映射
std::unordered_map<std::string, std::unique_ptr<func_holder_base>> func_registry_{};
};
/**
* @def REGISTER_SIMD_FUNCTION
* @brief 注册SIMD函数宏 - 便捷的函数注册接口
* @param func_name 函数名称(字符串字面量)
* @param version SIMD版本枚举值
* @param func 函数指针或可调用对象
*
* 简化函数注册的便捷宏,自动推导函数签名并注册。
*
* 示例:
* @code
* REGISTER_SIMD_FUNCTION("process", simd_func_version::SCALAR, scalar_process);
* REGISTER_SIMD_FUNCTION("process", simd_func_version::AVX2, avx2_process);
* @endcode
*/
#define REGISTER_SIMD_FUNCTION(func_name, version, func) \
simd_func_dispatcher::instance().register_function(func_name, version, std::function(func));
/**
* @def GET_SIMD_FUNCTION
* @brief SIMD function retrieval macro - convenience interface for looking up a function
* @param func_signature Function signature type
* @param func_name Function name
* @return Reference to the multi-version function object
*
* Example:
* @code
* auto& process = GET_SIMD_FUNCTION(void(float*, size_t), "process");
* process(data, count);
* @endcode
*/
#define GET_SIMD_FUNCTION(func_signature, func_name) \
simd_func_dispatcher::instance().get_function<func_signature>(func_name)
/**
* @def CALL_SIMD_FUNCTION
* @brief SIMD function call macro - convenience interface for invoking a function
* @param func_signature Function signature type
* @param func_name Function name
* @param ... Function arguments
*
* Example:
* @code
* CALL_SIMD_FUNCTION(void(float*, size_t), "process", data, count);
* @endcode
*/
#define CALL_SIMD_FUNCTION(func_signature, func_name, ...) \
simd_func_dispatcher::instance().call_function<func_signature>(func_name, __VA_ARGS__)
/**
* @class simd_auto_register
* @brief SIMD auto-registration helper - registers functions automatically via static initialization
* @tparam func_signature Function signature type
*
* This class uses C++ static initialization to register functions at program startup.
* It is usually not used directly; use the AUTO_REGISTER_SIMD_FUNCTION macro instead.
*
* @see AUTO_REGISTER_SIMD_FUNCTION
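*
* Example of direct use (a minimal sketch; scalar_process is hypothetical):
* @code
* static simd_auto_register<void(float*, size_t)> auto_reg_scalar_process(
*     "process", simd_func_version::SCALAR, scalar_process);
* @endcode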
*/
template <typename func_signature>
class simd_auto_register {
public:
simd_auto_register(const std::string& func_name, simd_func_version version, std::function<func_signature> func) {
simd_func_dispatcher::instance().register_function<func_signature>(func_name, version, std::move(func));
}
};
/**
* @brief Convert a SIMD function version enum to a string
* @param version Function version enum value
* @return Version name string
*
* Intended for debug output and logging.
*
* Example:
* @code
* const char* name = simd_func_version_to_string(simd_func_version::AVX2);
* // name = "AVX2"
* @endcode
*
* @see simd_func_dispatcher.cpp
*/
const char* simd_func_version_to_string(simd_func_version version);
/**
* @brief Convert a string to a SIMD function version enum
* @param version_str Version name string
* @return Corresponding function version enum value
*
* Returns simd_func_version::SCALAR if the string does not match any known version.
*
* Example:
* @code
* auto version = string_to_simd_func_version("AVX2");
* // version = simd_func_version::AVX2
* @endcode
*
* @see simd_func_dispatcher.cpp
*/
simd_func_version string_to_simd_func_version(const std::string& version_str);
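A minimal end-to-end sketch of how the dispatcher, its macros, and the version enum fit together (it assumes the header above is included; fill_scalar, fill_avx2, and the two wrapper functions are illustrative and not part of this commit):

#include <cstddef>

// Two illustrative implementations of the same logical operation.
static void fill_scalar(float* buf, float value, size_t n)
{
    for (size_t i = 0; i < n; ++i) buf[i] = value;
}
static void fill_avx2(float* buf, float value, size_t n)
{
    // Stand-in for a real AVX2 kernel; same observable behaviour as the scalar version.
    for (size_t i = 0; i < n; ++i) buf[i] = value;
}

void register_fill_versions()
{
    REGISTER_SIMD_FUNCTION("fill_buffer", simd_func_version::SCALAR, fill_scalar);
    REGISTER_SIMD_FUNCTION("fill_buffer", simd_func_version::AVX2, fill_avx2);
}

void fill_via_dispatcher(float* buf, size_t n)
{
    // The dispatcher resolves the best registered version at call time.
    CALL_SIMD_FUNCTION(void(float*, float, size_t), "fill_buffer", buf, 0.0f, n);
}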

View File

@@ -0,0 +1,4 @@
project(alicho_simd_interface)
simple_library(INTERFACE)
target_compile_definitions(${PROJECT_NAME} INTERFACE SIMD_INTERFACE_EXPORTS)

View File

@@ -29,6 +29,7 @@
#include <new>
#include <cstddef>
#include <stdexcept> // needed by the ASSERT_ALIGNED macro
#include <numeric>
/**
* @defgroup alignment_constants
@@ -385,29 +386,23 @@ bool operator!=(const aligned_allocator<T1, A1>&, const aligned_allocator<T2, A2
/** SSE aligned allocator (16-byte alignment)
* For code optimized with SSE/SSE2 instructions
* @tparam T
*/
template <typename T>
using sse_aligned_allocator = aligned_allocator<T, ALIGNMENT_SSE>;
using sse_aligned_allocator = aligned_allocator<float, ALIGNMENT_SSE>;
/** AVX aligned allocator (32-byte alignment)
* For code optimized with AVX/AVX2 instructions
* @tparam T
*
*
* @code
* std::vector<float, avx_aligned_allocator<float>> data(1024);
* @endcode
*/
template <typename T>
using avx_aligned_allocator = aligned_allocator<T, ALIGNMENT_AVX>;
using avx_aligned_allocator = aligned_allocator<float, ALIGNMENT_AVX>;
/** AVX-512 aligned allocator (64-byte alignment)
* For code optimized with AVX-512 instructions
* @tparam T
*/
template <typename T>
using avx512_aligned_allocator = aligned_allocator<T, ALIGNMENT_AVX512>;
using avx512_aligned_allocator = aligned_allocator<float, ALIGNMENT_AVX512>;
/** Cache-line aligned allocator (64-byte alignment)
* Avoids false sharing across cache lines

View File

@@ -0,0 +1,17 @@
#pragma once
#if defined(_MSC_VER)
#ifdef SIMD_INTERFACE_EXPORTS
#define SIMD_EXPORT __declspec(dllexport)
#else
#define SIMD_EXPORT __declspec(dllimport)
#endif
#elif defined(__GNUC__)
#ifdef SIMD_INTERFACE_EXPORTS
#define SIMD_EXPORT __attribute__((visibility("default")))
#else
#define SIMD_EXPORT
#endif
#else
#define SIMD_EXPORT
#endif

View File

@@ -0,0 +1 @@
#include "simd_interface.h"

View File

@@ -0,0 +1,17 @@
#pragma once
#include "simd_export.h"
#include <cstddef>
extern "C" {
SIMD_EXPORT void fill_buffer(float* buffer, float value, size_t num_samples);
SIMD_EXPORT void mix_audio(const float* src1, const float* src2, float* dst, size_t num_samples);
SIMD_EXPORT void apply_gain(const float* src, float* dst, float gain, size_t num_samples);
SIMD_EXPORT float calculate_rms(const float* src, size_t num_samples);
SIMD_EXPORT float calculate_peak(const float* src, size_t num_samples);
SIMD_EXPORT void normalize_audio(const float* src, float* dst, float target_peak, size_t num_samples);
SIMD_EXPORT void stereo_to_mono(const float* stereo_src, float* mono_dst, size_t num_stereo_samples);
SIMD_EXPORT void limit_audio(const float* src, float* dst, float threshold, float* limiter_state, float sample_rate, size_t num_samples);
SIMD_EXPORT void fade_audio(const float* src, float* dst, size_t fade_in_samples, size_t fade_out_samples, size_t num_samples);
SIMD_EXPORT void simple_eq(const float* src, float* dst, float low_gain, float mid_gain, float high_gain, float* eq_state, size_t num_samples);
}
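A brief caller-side sketch of the C interface above. The 16-byte alignment keeps the same call sites valid for the SSE implementation's alignment assertions; buffer sizes and values are arbitrary.

#include "simd_interface.h"

void mix_and_measure()
{
    alignas(16) float a[256];
    alignas(16) float b[256];
    alignas(16) float out[256];
    fill_buffer(a, 0.25f, 256);
    fill_buffer(b, 0.5f, 256);
    mix_audio(a, b, out, 256);                    // out[i] = a[i] + b[i] = 0.75
    const float rms = calculate_rms(out, 256);    // 0.75 for a constant signal
    const float peak = calculate_peak(out, 256);  // 0.75
    apply_gain(out, out, 0.5f, 256);              // scale back down to 0.375
    (void)rms;
    (void)peak;
}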

View File

@@ -0,0 +1,4 @@
project(alicho_simd_scaler)
simple_library(SHARED)
target_link_libraries(${PROJECT_NAME} PUBLIC alicho_simd_interface)

View File

@@ -0,0 +1,179 @@
#include "simd_interface.h"
#include <cmath>
extern "C"
{
SIMD_EXPORT void fill_buffer(float *buffer, float value, size_t num_samples)
{
for (size_t i = 0; i < num_samples; ++i)
{
buffer[i] = value;
}
}
SIMD_EXPORT void mix_audio(const float *src1, const float *src2, float *dst, size_t num_samples)
{
for (size_t i = 0; i < num_samples; ++i)
{
dst[i] = src1[i] + src2[i];
}
}
SIMD_EXPORT void apply_gain(const float *src, float *dst, float gain, size_t num_samples)
{
for (size_t i = 0; i < num_samples; ++i)
{
dst[i] = src[i] * gain;
}
}
SIMD_EXPORT float calculate_rms(const float *src, size_t num_samples)
{
float sum_squares = 0.0f;
for (size_t i = 0; i < num_samples; ++i)
{
sum_squares += src[i] * src[i];
}
return std::sqrt(sum_squares / static_cast<float>(num_samples));
}
SIMD_EXPORT float calculate_peak(const float *src, size_t num_samples)
{
float peak = 0.0f;
for (size_t i = 0; i < num_samples; ++i)
{
float abs_sample = std::fabs(src[i]);
if (abs_sample > peak)
{
peak = abs_sample;
}
}
return peak;
}
SIMD_EXPORT void normalize_audio(const float *src, float *dst, float target_peak, size_t num_samples)
{
if (num_samples == 0 || target_peak <= 0.0f)
{
return;
}
const float current_peak = calculate_peak(src, num_samples);
if (current_peak < 1e-10f)
{
for (size_t i = 0; i < num_samples; ++i)
{
dst[i] = 0.0f;
}
return;
}
const float gain_factor = target_peak / current_peak;
apply_gain(src, dst, gain_factor, num_samples);
}
SIMD_EXPORT void stereo_to_mono(const float *stereo_src, float *mono_dst, size_t num_stereo_samples)
{
for (size_t i = 0; i < num_stereo_samples; i += 2)
{
mono_dst[i / 2] = (stereo_src[i] + stereo_src[i + 1]) * 0.5f;
}
}
SIMD_EXPORT void limit_audio(const float *src, float *dst, float threshold, float *limiter_state, float sample_rate,
size_t num_samples)
{
if (num_samples == 0 || threshold <= 0.0f)
{
return;
}
constexpr float release_time = 0.05f;
float release_coeff = std::exp(-1.0f / (release_time * sample_rate));
float current_gain = limiter_state != nullptr ? *limiter_state : 1.0f;
for (size_t i = 0; i < num_samples; ++i)
{
float sample = src[i];
float abs_sample = std::fabs(sample);
float target_gain = abs_sample > threshold ? threshold / abs_sample : 1.0f;
if (target_gain < current_gain)
{
current_gain = target_gain;
}
else
{
current_gain = target_gain + (current_gain - target_gain) * release_coeff;
}
dst[i] = sample * current_gain;
}
if (limiter_state != nullptr)
{
*limiter_state = current_gain;
}
}
SIMD_EXPORT void fade_audio(const float *src, float *dst, size_t fade_in_samples, size_t fade_out_samples,
size_t num_samples)
{
if (num_samples == 0)
{
return;
}
size_t i = 0;
if (fade_in_samples > 0)
{
const float fade_in_step = 1.0f / static_cast<float>(fade_in_samples);
for (; i < std::min(fade_in_samples, num_samples); ++i)
{
const float gain = static_cast<float>(i) * fade_in_step;
dst[i] = src[i] * gain;
}
}
if (fade_out_samples > 0)
{
const size_t fade_out_start = num_samples > fade_out_samples ? num_samples - fade_out_samples : 0;
const float fade_out_step = 1.0f / static_cast<float>(fade_out_samples);
for (size_t j = fade_out_start; j < num_samples; ++j)
{
const size_t fade_out_offset = j - fade_out_start;
const float gain = 1.0f - static_cast<float>(fade_out_offset) * fade_out_step;
dst[j] = src[j] * gain;
}
}
}
SIMD_EXPORT void simple_eq(const float *src, float *dst, float low_gain, float mid_gain, float high_gain, float *eq_state, size_t num_samples)
{
if (num_samples == 0)
{
return;
}
float low_pass_state = eq_state != nullptr ? eq_state[0] : 0.0f;
float high_pass_state = eq_state != nullptr ? eq_state[1] : 0.0f;
const float low_cutoff_freq = 200.0f;
const float high_cutoff_freq = 2000.0f;
const float sample_rate = 44100.0f;
const float low_alpha = low_cutoff_freq / (low_cutoff_freq + sample_rate);
const float high_alpha = sample_rate / (high_cutoff_freq + sample_rate);
for (size_t i = 0; i < num_samples; ++i)
{
float sample = src[i];
low_pass_state += low_alpha * (sample - low_pass_state);
float low_freq = low_pass_state;
high_pass_state = high_alpha * (high_pass_state + sample - (i > 0 ? src[i - 1] : 0.0f));
float high_freq = high_pass_state;
float mid_freq = sample - low_freq - high_freq;
dst[i] = low_freq * low_gain + mid_freq * mid_gain + high_freq * high_gain;
}
if (eq_state != nullptr)
{
eq_state[0] = low_pass_state;
eq_state[1] = high_pass_state;
}
}
} // extern "C"
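A quick numerical check of the normalization path above (hand-picked values for illustration; this is not a test from this commit):

#include "simd_interface.h"
#include <cstdio>

int main()
{
    float in[4] = {0.1f, -0.5f, 0.25f, 0.0f};  // calculate_peak() returns 0.5
    float out[4] = {};
    normalize_audio(in, out, 1.0f, 4);         // gain_factor = 1.0 / 0.5 = 2.0
    std::printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);
    // Prints: 0.200000 -1.000000 0.500000 0.000000
    return 0;
}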

View File

@@ -0,0 +1,7 @@
project(alicho_simd_sse)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64")
simple_library(SHARED)
target_compile_options(${PROJECT_NAME} PRIVATE -msse4.2)
target_link_libraries(${PROJECT_NAME} PUBLIC alicho_simd_interface)
endif()

View File

@@ -0,0 +1,699 @@
/**
* @file x86_sse_audio_processing_func.cpp
* @brief x86 SSE implementations of the audio processing functions
*/
#include "simd_interface.h"
#include <cmath>
#include <immintrin.h>
#include "aligned_allocator.h"
extern "C"
{
SIMD_EXPORT void fill_buffer(float *buffer, float value, size_t num_samples)
{
ASSERT_ALIGNED(buffer, ALIGNMENT_SSE);
constexpr size_t simd_width = 4;
constexpr size_t unroll_factor = 4;
size_t i = 0;
auto value_vec = _mm_set1_ps(value);
for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
{
_mm_store_ps(&buffer[i], value_vec);
_mm_store_ps(&buffer[i + 4], value_vec);
_mm_store_ps(&buffer[i + 8], value_vec);
_mm_store_ps(&buffer[i + 12], value_vec);
}
for (; i + simd_width <= num_samples; i += simd_width)
{
_mm_store_ps(&buffer[i], value_vec);
}
for (; i < num_samples; ++i)
{
buffer[i] = value;
}
}
SIMD_EXPORT void mix_audio(const float *src1, const float *src2, float *dst, size_t num_samples)
{
ASSERT_ALIGNED(src1, ALIGNMENT_SSE);
ASSERT_ALIGNED(src2, ALIGNMENT_SSE);
ASSERT_ALIGNED(dst, ALIGNMENT_SSE);
constexpr size_t simd_width = 4;
constexpr size_t unroll_factor = 4;
size_t i = 0;
for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
{
auto a0 = _mm_load_ps(&src1[i]);
auto a1 = _mm_load_ps(&src1[i + 4]);
auto a2 = _mm_load_ps(&src1[i + 8]);
auto a3 = _mm_load_ps(&src1[i + 12]);
auto b0 = _mm_load_ps(&src2[i]);
auto b1 = _mm_load_ps(&src2[i + 4]);
auto b2 = _mm_load_ps(&src2[i + 8]);
auto b3 = _mm_load_ps(&src2[i + 12]);
auto result0 = _mm_add_ps(a0, b0);
auto result1 = _mm_add_ps(a1, b1);
auto result2 = _mm_add_ps(a2, b2);
auto result3 = _mm_add_ps(a3, b3);
_mm_store_ps(&dst[i], result0);
_mm_store_ps(&dst[i + 4], result1);
_mm_store_ps(&dst[i + 8], result2);
_mm_store_ps(&dst[i + 12], result3);
}
for (; i + simd_width <= num_samples; i += simd_width)
{
auto a = _mm_load_ps(&src1[i]);
auto b = _mm_load_ps(&src2[i]);
auto result = _mm_add_ps(a, b);
_mm_store_ps(&dst[i], result);
}
for (; i < num_samples; ++i)
{
dst[i] = src1[i] + src2[i];
}
}
SIMD_EXPORT void apply_gain(const float *src, float *dst, float gain, size_t num_samples)
{
ASSERT_ALIGNED(src, ALIGNMENT_SSE);
ASSERT_ALIGNED(dst, ALIGNMENT_SSE);
constexpr size_t simd_width = 4;
constexpr size_t unroll_factor = 4;
size_t i = 0;
auto gain_vec = _mm_set1_ps(gain);
for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
{
auto a0 = _mm_load_ps(&src[i]);
auto a1 = _mm_load_ps(&src[i + 4]);
auto a2 = _mm_load_ps(&src[i + 8]);
auto a3 = _mm_load_ps(&src[i + 12]);
auto result0 = _mm_mul_ps(a0, gain_vec);
auto result1 = _mm_mul_ps(a1, gain_vec);
auto result2 = _mm_mul_ps(a2, gain_vec);
auto result3 = _mm_mul_ps(a3, gain_vec);
_mm_store_ps(&dst[i], result0);
_mm_store_ps(&dst[i + 4], result1);
_mm_store_ps(&dst[i + 8], result2);
_mm_store_ps(&dst[i + 12], result3);
}
for (; i + simd_width <= num_samples; i += simd_width)
{
auto a = _mm_load_ps(&src[i]);
auto result = _mm_mul_ps(a, gain_vec);
_mm_store_ps(&dst[i], result);
}
for (; i < num_samples; ++i)
{
dst[i] = src[i] * gain;
}
}
SIMD_EXPORT float calculate_rms(const float *src, size_t num_samples)
{
ASSERT_ALIGNED(src, ALIGNMENT_SSE);
constexpr size_t simd_width = 4;
constexpr size_t unroll_factor = 4;
size_t i = 0;
auto sum_squares0 = _mm_setzero_ps();
auto sum_squares1 = _mm_setzero_ps();
auto sum_squares2 = _mm_setzero_ps();
auto sum_squares3 = _mm_setzero_ps();
for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
{
const auto a0 = _mm_load_ps(&src[i]);
const auto a1 = _mm_load_ps(&src[i + 4]);
const auto a2 = _mm_load_ps(&src[i + 8]);
const auto a3 = _mm_load_ps(&src[i + 12]);
const auto squared0 = _mm_mul_ps(a0, a0);
const auto squared1 = _mm_mul_ps(a1, a1);
const auto squared2 = _mm_mul_ps(a2, a2);
const auto squared3 = _mm_mul_ps(a3, a3);
sum_squares0 = _mm_add_ps(sum_squares0, squared0);
sum_squares1 = _mm_add_ps(sum_squares1, squared1);
sum_squares2 = _mm_add_ps(sum_squares2, squared2);
sum_squares3 = _mm_add_ps(sum_squares3, squared3);
}
auto sum_squares = _mm_add_ps(_mm_add_ps(sum_squares0, sum_squares1),
_mm_add_ps(sum_squares2, sum_squares3));
for (; i + simd_width <= num_samples; i += simd_width)
{
const auto a = _mm_load_ps(&src[i]);
const auto squared = _mm_mul_ps(a, a);
sum_squares = _mm_add_ps(sum_squares, squared);
}
auto hadd1 = _mm_hadd_ps(sum_squares, sum_squares);
auto hadd2 = _mm_hadd_ps(hadd1, hadd1);
double total_sum = _mm_cvtss_f32(hadd2);
for (; i < num_samples; ++i)
{
total_sum += static_cast<double>(src[i]) * static_cast<double>(src[i]);
}
return static_cast<float>(std::sqrt(total_sum / static_cast<double>(num_samples)));
}
SIMD_EXPORT float calculate_peak(const float *src, size_t num_samples)
{
ASSERT_ALIGNED(src, ALIGNMENT_SSE);
constexpr size_t simd_width = 4;
constexpr size_t unroll_factor = 4;
size_t i = 0;
auto peak_vec0 = _mm_setzero_ps();
auto peak_vec1 = _mm_setzero_ps();
auto peak_vec2 = _mm_setzero_ps();
auto peak_vec3 = _mm_setzero_ps();
for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
{
const auto a0 = _mm_load_ps(&src[i]);
const auto a1 = _mm_load_ps(&src[i + 4]);
const auto a2 = _mm_load_ps(&src[i + 8]);
const auto a3 = _mm_load_ps(&src[i + 12]);
const auto abs_a0 = _mm_andnot_ps(_mm_set1_ps(-0.0f), a0);
const auto abs_a1 = _mm_andnot_ps(_mm_set1_ps(-0.0f), a1);
const auto abs_a2 = _mm_andnot_ps(_mm_set1_ps(-0.0f), a2);
const auto abs_a3 = _mm_andnot_ps(_mm_set1_ps(-0.0f), a3);
peak_vec0 = _mm_max_ps(peak_vec0, abs_a0);
peak_vec1 = _mm_max_ps(peak_vec1, abs_a1);
peak_vec2 = _mm_max_ps(peak_vec2, abs_a2);
peak_vec3 = _mm_max_ps(peak_vec3, abs_a3);
}
auto peak_vec = _mm_max_ps(_mm_max_ps(peak_vec0, peak_vec1),
_mm_max_ps(peak_vec2, peak_vec3));
for (; i + simd_width <= num_samples; i += simd_width)
{
const auto a = _mm_load_ps(&src[i]);
const auto abs_a = _mm_andnot_ps(_mm_set1_ps(-0.0f), a);
peak_vec = _mm_max_ps(peak_vec, abs_a);
}
auto temp1 = _mm_shuffle_ps(peak_vec, peak_vec, _MM_SHUFFLE(2, 3, 0, 1));
auto max1 = _mm_max_ps(peak_vec, temp1);
auto temp2 = _mm_shuffle_ps(max1, max1, _MM_SHUFFLE(1, 0, 3, 2));
auto final_max = _mm_max_ps(max1, temp2);
float peak = _mm_cvtss_f32(final_max);
for (; i < num_samples; ++i)
{
float abs_sample = std::fabs(src[i]);
if (abs_sample > peak)
{
peak = abs_sample;
}
}
return peak;
}
SIMD_EXPORT void normalize_audio(const float *src, float *dst, float target_peak, size_t num_samples)
{
ASSERT_ALIGNED(src, ALIGNMENT_SSE);
ASSERT_ALIGNED(dst, ALIGNMENT_SSE);
if (num_samples == 0 || target_peak <= 0.0f)
{
return;
}
const float current_peak = calculate_peak(src, num_samples);
if (current_peak < 1e-10f)
{
constexpr size_t simd_width = 4;
constexpr size_t unroll_factor = 4;
auto zero_vec = _mm_setzero_ps();
size_t i = 0;
for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
{
_mm_store_ps(&dst[i], zero_vec);
_mm_store_ps(&dst[i + 4], zero_vec);
_mm_store_ps(&dst[i + 8], zero_vec);
_mm_store_ps(&dst[i + 12], zero_vec);
}
for (; i + simd_width <= num_samples; i += simd_width)
{
_mm_store_ps(&dst[i], zero_vec);
}
for (; i < num_samples; ++i)
{
dst[i] = 0.0f;
}
return;
}
const float gain_factor = target_peak / current_peak;
apply_gain(src, dst, gain_factor, num_samples);
}
SIMD_EXPORT void stereo_to_mono(const float *stereo_src, float *mono_dst, size_t num_stereo_samples)
{
ASSERT_ALIGNED(stereo_src, ALIGNMENT_SSE);
ASSERT_ALIGNED(mono_dst, ALIGNMENT_SSE);
if (num_stereo_samples == 0)
{
return;
}
constexpr size_t simd_width = 4;
constexpr size_t unroll_factor = 4;
const auto half_vec = _mm_set1_ps(0.5f);
size_t stereo_idx = 0;
size_t mono_idx = 0;
for (; stereo_idx + simd_width * 2 * unroll_factor <= num_stereo_samples * 2;
stereo_idx += simd_width * 2 * unroll_factor, mono_idx += simd_width * unroll_factor)
{
auto stereo0 = _mm_load_ps(&stereo_src[stereo_idx]);
auto stereo1 = _mm_load_ps(&stereo_src[stereo_idx + 4]);
auto stereo2 = _mm_load_ps(&stereo_src[stereo_idx + 8]);
auto stereo3 = _mm_load_ps(&stereo_src[stereo_idx + 12]);
auto stereo4 = _mm_load_ps(&stereo_src[stereo_idx + 16]);
auto stereo5 = _mm_load_ps(&stereo_src[stereo_idx + 20]);
auto stereo6 = _mm_load_ps(&stereo_src[stereo_idx + 24]);
auto stereo7 = _mm_load_ps(&stereo_src[stereo_idx + 28]);
auto left0 = _mm_shuffle_ps(stereo0, stereo1, _MM_SHUFFLE(2, 0, 2, 0));
auto right0 = _mm_shuffle_ps(stereo0, stereo1, _MM_SHUFFLE(3, 1, 3, 1));
auto left1 = _mm_shuffle_ps(stereo2, stereo3, _MM_SHUFFLE(2, 0, 2, 0));
auto right1 = _mm_shuffle_ps(stereo2, stereo3, _MM_SHUFFLE(3, 1, 3, 1));
auto left2 = _mm_shuffle_ps(stereo4, stereo5, _MM_SHUFFLE(2, 0, 2, 0));
auto right2 = _mm_shuffle_ps(stereo4, stereo5, _MM_SHUFFLE(3, 1, 3, 1));
auto left3 = _mm_shuffle_ps(stereo6, stereo7, _MM_SHUFFLE(2, 0, 2, 0));
auto right3 = _mm_shuffle_ps(stereo6, stereo7, _MM_SHUFFLE(3, 1, 3, 1));
auto mono0 = _mm_mul_ps(_mm_add_ps(left0, right0), half_vec);
auto mono1 = _mm_mul_ps(_mm_add_ps(left1, right1), half_vec);
auto mono2 = _mm_mul_ps(_mm_add_ps(left2, right2), half_vec);
auto mono3 = _mm_mul_ps(_mm_add_ps(left3, right3), half_vec);
_mm_store_ps(&mono_dst[mono_idx], mono0);
_mm_store_ps(&mono_dst[mono_idx + 4], mono1);
_mm_store_ps(&mono_dst[mono_idx + 8], mono2);
_mm_store_ps(&mono_dst[mono_idx + 12], mono3);
}
for (size_t i = stereo_idx / 2; i < num_stereo_samples; ++i)
{
const float left = stereo_src[i * 2];
const float right = stereo_src[i * 2 + 1];
mono_dst[i] = (left + right) * 0.5f;
}
}
SIMD_EXPORT void limit_audio(const float *src, float *dst, float threshold, float *limiter_state, float sample_rate,
size_t num_samples)
{
ASSERT_ALIGNED(src, ALIGNMENT_SSE);
ASSERT_ALIGNED(dst, ALIGNMENT_SSE);
if (num_samples == 0 || threshold <= 0.0f)
{
return;
}
constexpr size_t simd_width = 4;
constexpr size_t unroll_factor = 4;
constexpr float release_time = 0.05f;
float release_coeff = std::exp(-1.0f / (release_time * sample_rate));
float current_gain = limiter_state != nullptr ? *limiter_state : 1.0f;
size_t i = 0;
for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
{
auto a0 = _mm_load_ps(&src[i]);
auto a1 = _mm_load_ps(&src[i + 4]);
auto a2 = _mm_load_ps(&src[i + 8]);
auto a3 = _mm_load_ps(&src[i + 12]);
auto abs_a0 = _mm_andnot_ps(_mm_set1_ps(-0.0f), a0);
auto abs_a1 = _mm_andnot_ps(_mm_set1_ps(-0.0f), a1);
auto abs_a2 = _mm_andnot_ps(_mm_set1_ps(-0.0f), a2);
auto abs_a3 = _mm_andnot_ps(_mm_set1_ps(-0.0f), a3);
auto max_abs = _mm_max_ps(_mm_max_ps(abs_a0, abs_a1),
_mm_max_ps(abs_a2, abs_a3));
auto temp1 = _mm_shuffle_ps(max_abs, max_abs, _MM_SHUFFLE(2, 3, 0, 1));
auto max1 = _mm_max_ps(max_abs, temp1);
auto temp2 = _mm_shuffle_ps(max1, max1, _MM_SHUFFLE(1, 0, 3, 2));
auto final_max = _mm_max_ps(max1, temp2);
float max_sample = _mm_cvtss_f32(final_max);
float target_gain = max_sample > threshold ? threshold / max_sample : 1.0f;
if (target_gain < current_gain)
{
current_gain = target_gain;
}
else
{
current_gain = target_gain + (current_gain - target_gain) * release_coeff;
}
auto gain_vec = _mm_set1_ps(current_gain);
auto result0 = _mm_mul_ps(a0, gain_vec);
auto result1 = _mm_mul_ps(a1, gain_vec);
auto result2 = _mm_mul_ps(a2, gain_vec);
auto result3 = _mm_mul_ps(a3, gain_vec);
_mm_store_ps(&dst[i], result0);
_mm_store_ps(&dst[i + 4], result1);
_mm_store_ps(&dst[i + 8], result2);
_mm_store_ps(&dst[i + 12], result3);
}
for (; i + simd_width <= num_samples; i += simd_width)
{
auto a = _mm_load_ps(&src[i]);
auto abs_a = _mm_andnot_ps(_mm_set1_ps(-0.0f), a);
auto max_abs = abs_a;
auto temp1 = _mm_shuffle_ps(max_abs, max_abs, _MM_SHUFFLE(2, 3, 0, 1));
auto max1 = _mm_max_ps(max_abs, temp1);
auto temp2 = _mm_shuffle_ps(max1, max1, _MM_SHUFFLE(1, 0, 3, 2));
auto final_max = _mm_max_ps(max1, temp2);
float max_sample = _mm_cvtss_f32(final_max);
float target_gain = max_sample > threshold ? threshold / max_sample : 1.0f;
if (target_gain < current_gain)
{
current_gain = target_gain;
}
else
{
current_gain = target_gain + (current_gain - target_gain) * release_coeff;
}
auto gain_vec = _mm_set1_ps(current_gain);
auto result = _mm_mul_ps(a, gain_vec);
_mm_store_ps(&dst[i], result);
}
for (; i < num_samples; ++i)
{
float sample = src[i];
float abs_sample = std::fabs(sample);
float target_gain = abs_sample > threshold ? threshold / abs_sample : 1.0f;
if (target_gain < current_gain)
{
current_gain = target_gain;
}
else
{
current_gain = target_gain + (current_gain - target_gain) * release_coeff;
}
dst[i] = sample * current_gain;
}
if (limiter_state != nullptr)
{
*limiter_state = current_gain;
}
}
SIMD_EXPORT void fade_audio(const float *src, float *dst, size_t fade_in_samples, size_t fade_out_samples,
size_t num_samples)
{
ASSERT_ALIGNED(src, ALIGNMENT_SSE);
ASSERT_ALIGNED(dst, ALIGNMENT_SSE);
if (num_samples == 0)
{
return;
}
constexpr size_t simd_width = 4;
constexpr size_t unroll_factor = 4;
size_t i = 0;
if (fade_in_samples > 0)
{
const float fade_in_step = 1.0f / static_cast<float>(fade_in_samples);
for (; i + simd_width * unroll_factor <= std::min(fade_in_samples, num_samples); i += simd_width *
unroll_factor)
{
auto gain0 = _mm_set_ps((i + 3) * fade_in_step, (i + 2) * fade_in_step, (i + 1) * fade_in_step,
i * fade_in_step);
auto gain1 = _mm_set_ps((i + 7) * fade_in_step, (i + 6) * fade_in_step, (i + 5) * fade_in_step,
(i + 4) * fade_in_step);
auto gain2 = _mm_set_ps((i + 11) * fade_in_step, (i + 10) * fade_in_step, (i + 9) * fade_in_step,
(i + 8) * fade_in_step);
auto gain3 = _mm_set_ps((i + 15) * fade_in_step, (i + 14) * fade_in_step, (i + 13) * fade_in_step,
(i + 12) * fade_in_step);
auto a0 = _mm_load_ps(&src[i]);
auto a1 = _mm_load_ps(&src[i + 4]);
auto a2 = _mm_load_ps(&src[i + 8]);
auto a3 = _mm_load_ps(&src[i + 12]);
auto result0 = _mm_mul_ps(a0, gain0);
auto result1 = _mm_mul_ps(a1, gain1);
auto result2 = _mm_mul_ps(a2, gain2);
auto result3 = _mm_mul_ps(a3, gain3);
_mm_store_ps(&dst[i], result0);
_mm_store_ps(&dst[i + 4], result1);
_mm_store_ps(&dst[i + 8], result2);
_mm_store_ps(&dst[i + 12], result3);
}
for (; i < std::min(fade_in_samples, num_samples); ++i)
{
const float gain = static_cast<float>(i) / static_cast<float>(fade_in_samples);
dst[i] = src[i] * gain;
}
}
const size_t middle_start = fade_in_samples;
const size_t middle_end = num_samples > fade_out_samples ? num_samples - fade_out_samples : 0;
if (middle_end > middle_start)
{
for (size_t j = middle_start; j + simd_width * unroll_factor <= middle_end; j += simd_width *
unroll_factor)
{
auto a0 = _mm_load_ps(&src[j]);
auto a1 = _mm_load_ps(&src[j + 4]);
auto a2 = _mm_load_ps(&src[j + 8]);
auto a3 = _mm_load_ps(&src[j + 12]);
_mm_store_ps(&dst[j], a0);
_mm_store_ps(&dst[j + 4], a1);
_mm_store_ps(&dst[j + 8], a2);
_mm_store_ps(&dst[j + 12], a3);
}
for (size_t j = middle_start + ((middle_end - middle_start) / (simd_width * unroll_factor)) * (simd_width *
unroll_factor);
j < middle_end; ++j)
{
dst[j] = src[j];
}
}
if (fade_out_samples > 0 && num_samples > fade_out_samples)
{
const size_t fade_out_start = num_samples - fade_out_samples;
const float fade_out_step = 1.0f / static_cast<float>(fade_out_samples);
for (size_t j = fade_out_start; j + simd_width * unroll_factor <= num_samples; j += simd_width *
unroll_factor)
{
const size_t fade_out_offset = j - fade_out_start;
auto gain0 = _mm_set_ps(1.0f - (fade_out_offset + 3) * fade_out_step,
1.0f - (fade_out_offset + 2) * fade_out_step,
1.0f - (fade_out_offset + 1) * fade_out_step,
1.0f - fade_out_offset * fade_out_step);
auto gain1 = _mm_set_ps(1.0f - (fade_out_offset + 7) * fade_out_step,
1.0f - (fade_out_offset + 6) * fade_out_step,
1.0f - (fade_out_offset + 5) * fade_out_step,
1.0f - (fade_out_offset + 4) * fade_out_step);
auto gain2 = _mm_set_ps(1.0f - (fade_out_offset + 11) * fade_out_step,
1.0f - (fade_out_offset + 10) * fade_out_step,
1.0f - (fade_out_offset + 9) * fade_out_step,
1.0f - (fade_out_offset + 8) * fade_out_step);
auto gain3 = _mm_set_ps(1.0f - (fade_out_offset + 15) * fade_out_step,
1.0f - (fade_out_offset + 14) * fade_out_step,
1.0f - (fade_out_offset + 13) * fade_out_step,
1.0f - (fade_out_offset + 12) * fade_out_step);
auto a0 = _mm_load_ps(&src[j]);
auto a1 = _mm_load_ps(&src[j + 4]);
auto a2 = _mm_load_ps(&src[j + 8]);
auto a3 = _mm_load_ps(&src[j + 12]);
auto result0 = _mm_mul_ps(a0, gain0);
auto result1 = _mm_mul_ps(a1, gain1);
auto result2 = _mm_mul_ps(a2, gain2);
auto result3 = _mm_mul_ps(a3, gain3);
_mm_store_ps(&dst[j], result0);
_mm_store_ps(&dst[j + 4], result1);
_mm_store_ps(&dst[j + 8], result2);
_mm_store_ps(&dst[j + 12], result3);
}
for (size_t j = fade_out_start + ((fade_out_samples / (simd_width * unroll_factor)) * (simd_width *
unroll_factor));
j < num_samples; ++j)
{
const size_t fade_out_offset = j - fade_out_start;
const float gain = 1.0f - static_cast<float>(fade_out_offset) / static_cast<float>(fade_out_samples);
dst[j] = src[j] * gain;
}
}
}
SIMD_EXPORT void simple_eq(const float *src, float *dst, float low_gain, float mid_gain, float high_gain, float *eq_state,
size_t num_samples)
{
ASSERT_ALIGNED(src, ALIGNMENT_SSE);
ASSERT_ALIGNED(dst, ALIGNMENT_SSE);
if (num_samples == 0)
{
return;
}
constexpr size_t simd_width = 4;
constexpr size_t unroll_factor = 4;
constexpr float low_cutoff = 0.02f;
constexpr float high_cutoff = 0.1f;
constexpr float mid_factor = 0.7f;
float low_state = eq_state != nullptr ? *eq_state : 0.0f;
float high_state = eq_state != nullptr ? *(eq_state + 1) : 0.0f;
const auto low_gain_vec = _mm_set1_ps(low_gain);
const auto mid_gain_vec = _mm_set1_ps(mid_gain);
const auto high_gain_vec = _mm_set1_ps(high_gain);
const auto low_cutoff_vec = _mm_set1_ps(low_cutoff);
const auto high_cutoff_vec = _mm_set1_ps(high_cutoff);
const auto mid_factor_vec = _mm_set1_ps(mid_factor);
const auto one_minus_low_cutoff_vec = _mm_set1_ps(1.0f - low_cutoff);
const auto one_minus_high_cutoff_vec = _mm_set1_ps(1.0f - high_cutoff);
size_t i = 0;
for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
{
auto input0 = _mm_load_ps(&src[i]);
auto input1 = _mm_load_ps(&src[i + 4]);
auto input2 = _mm_load_ps(&src[i + 8]);
auto input3 = _mm_load_ps(&src[i + 12]);
auto low_state_vec = _mm_set1_ps(low_state);
auto low0 = _mm_add_ps(_mm_mul_ps(input0, low_cutoff_vec),
_mm_mul_ps(low_state_vec, one_minus_low_cutoff_vec));
auto low1 = _mm_add_ps(_mm_mul_ps(input1, low_cutoff_vec), _mm_mul_ps(low0, one_minus_low_cutoff_vec));
auto low2 = _mm_add_ps(_mm_mul_ps(input2, low_cutoff_vec), _mm_mul_ps(low1, one_minus_low_cutoff_vec));
auto low3 = _mm_add_ps(_mm_mul_ps(input3, low_cutoff_vec), _mm_mul_ps(low2, one_minus_low_cutoff_vec));
auto high0 = _mm_sub_ps(input0, low0);
auto high1 = _mm_sub_ps(input1, low1);
auto high2 = _mm_sub_ps(input2, low2);
auto high3 = _mm_sub_ps(input3, low3);
auto high_state_vec = _mm_set1_ps(high_state);
high0 = _mm_add_ps(_mm_mul_ps(high0, high_cutoff_vec),
_mm_mul_ps(high_state_vec, one_minus_high_cutoff_vec));
high1 = _mm_add_ps(_mm_mul_ps(high1, high_cutoff_vec), _mm_mul_ps(high0, one_minus_high_cutoff_vec));
high2 = _mm_add_ps(_mm_mul_ps(high2, high_cutoff_vec), _mm_mul_ps(high1, one_minus_high_cutoff_vec));
high3 = _mm_add_ps(_mm_mul_ps(high3, high_cutoff_vec), _mm_mul_ps(high2, one_minus_high_cutoff_vec));
auto mid0 = _mm_mul_ps(_mm_sub_ps(_mm_sub_ps(input0, low0), high0), mid_factor_vec);
auto mid1 = _mm_mul_ps(_mm_sub_ps(_mm_sub_ps(input1, low1), high1), mid_factor_vec);
auto mid2 = _mm_mul_ps(_mm_sub_ps(_mm_sub_ps(input2, low2), high2), mid_factor_vec);
auto mid3 = _mm_mul_ps(_mm_sub_ps(_mm_sub_ps(input3, low3), high3), mid_factor_vec);
auto result0 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(low0, low_gain_vec), _mm_mul_ps(mid0, mid_gain_vec)),
_mm_mul_ps(high0, high_gain_vec));
auto result1 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(low1, low_gain_vec), _mm_mul_ps(mid1, mid_gain_vec)),
_mm_mul_ps(high1, high_gain_vec));
auto result2 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(low2, low_gain_vec), _mm_mul_ps(mid2, mid_gain_vec)),
_mm_mul_ps(high2, high_gain_vec));
auto result3 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(low3, low_gain_vec), _mm_mul_ps(mid3, mid_gain_vec)),
_mm_mul_ps(high3, high_gain_vec));
_mm_store_ps(&dst[i], result0);
_mm_store_ps(&dst[i + 4], result1);
_mm_store_ps(&dst[i + 8], result2);
_mm_store_ps(&dst[i + 12], result3);
low_state = _mm_cvtss_f32(_mm_shuffle_ps(low3, low3, _MM_SHUFFLE(3, 3, 3, 3)));
high_state = _mm_cvtss_f32(_mm_shuffle_ps(high3, high3, _MM_SHUFFLE(3, 3, 3, 3)));
}
for (; i < num_samples; ++i)
{
float input = src[i];
float low_output = low_cutoff * input + (1.0f - low_cutoff) * low_state;
low_state = low_output;
float high_input = input - low_output;
float high_output = high_cutoff * high_input + (1.0f - high_cutoff) * high_state;
high_state = high_output;
float mid_output = (input - low_output - high_output) * mid_factor;
dst[i] = low_output * low_gain + mid_output * mid_gain + high_output * high_gain;
}
if (eq_state != nullptr)
{
*eq_state = low_state;
*(eq_state + 1) = high_state;
}
}
}
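Because the scalar and SSE builds are separate shared libraries that export the same C symbols, a host can also choose between them at runtime rather than at link time. A POSIX-only sketch (the library file names are assumptions derived from the CMake target names, and error handling is minimal):

#include <dlfcn.h>
#include <cstddef>
#include <cstdio>

using fill_buffer_fn = void (*)(float*, float, size_t);

int main()
{
    // Prefer the SSE build, fall back to the scalar build.
    void* lib = dlopen("./libalicho_simd_sse.so", RTLD_NOW);
    if (lib == nullptr) lib = dlopen("./libalicho_simd_scaler.so", RTLD_NOW);
    if (lib == nullptr)
    {
        std::fprintf(stderr, "%s\n", dlerror());
        return 1;
    }
    auto fill = reinterpret_cast<fill_buffer_fn>(dlsym(lib, "fill_buffer"));
    if (fill != nullptr)
    {
        alignas(16) float buf[64];
        fill(buf, 0.0f, 64);
    }
    dlclose(lib);
    return 0;
}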

View File

@@ -10,6 +10,7 @@ enable_testing()
include(cmake/test_helpers.cmake)
# Add each test module
add_subdirectory(helpers)
add_subdirectory(simd)
add_subdirectory(network)
add_subdirectory(shm)

View File

@@ -0,0 +1,4 @@
project(alicho_test_helper)
simple_library(STATIC)
target_link_libraries(${PROJECT_NAME} PUBLIC GTest::gtest GTest::gtest_main audio_backend_project_options)
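A minimal shape for a unit test that links this helper library together with the C interface (the helper's own utilities are not visible in this diff, so only GTest assertions and the interface declared above are used; GTest::gtest_main supplies main()):

#include <gtest/gtest.h>
#include "simd_interface.h"

TEST(simd_audio, apply_gain_scales_each_sample)
{
    alignas(16) float src[4] = {1.0f, -2.0f, 0.5f, 0.0f};
    alignas(16) float dst[4] = {};
    apply_gain(src, dst, 2.0f, 4);
    EXPECT_FLOAT_EQ(dst[0], 2.0f);
    EXPECT_FLOAT_EQ(dst[1], -4.0f);
    EXPECT_FLOAT_EQ(dst[2], 1.0f);
    EXPECT_FLOAT_EQ(dst[3], 0.0f);
}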

View File

@@ -7,7 +7,7 @@ add_module_test(
TARGET test_simd_basic
SOURCE_FILE test_simd_basic.cpp
MODULE "SIMD"
LINK_LIBRARIES alicho_simd
LINK_LIBRARIES alicho_simd alicho_simd_interface alicho_test_helper
)
# SIMD audio processing tests
@@ -15,7 +15,7 @@ add_module_test(
TARGET test_simd_audio_processing
SOURCE_FILE test_simd_audio_processing.cpp
MODULE "SIMD"
LINK_LIBRARIES alicho_simd
LINK_LIBRARIES alicho_simd alicho_simd_interface alicho_test_helper
)
# Custom target: run the SIMD tests

File diff suppressed because it is too large

File diff suppressed because it is too large