Add SIMD audio processing interface and implementations
- Created a new SIMD interface header and source files for audio processing functions. - Implemented functions for filling buffers, mixing audio, applying gain, calculating RMS and peak values, normalizing audio, converting stereo to mono, limiting audio, fading audio, and a simple equalizer. - Added SSE-specific implementations for the audio processing functions to leverage SIMD for performance improvements. - Updated CMakeLists.txt files to include new libraries and link dependencies for the SIMD interface and SSE implementations. - Introduced a static test helper library for unit testing with Google Test framework.
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -5,3 +5,5 @@
|
||||
.DS_Store
|
||||
/build/
|
||||
/.vs
|
||||
/out
|
||||
/logs
|
||||
@@ -15,7 +15,6 @@ include(cmake/compiler_options.cmake)
|
||||
include(cmake/mingw_dll.cmake)
|
||||
|
||||
configure_project_defaults()
|
||||
configure_simd_optimizations()
|
||||
setup_project_options(
|
||||
STANDARD 20
|
||||
INTERFACE_TARGET audio_backend_project_options
|
||||
|
||||
@@ -15,6 +15,19 @@
|
||||
"CMAKE_CXX_COMPILER": "cl.exe",
|
||||
"CMAKE_TOOLCHAIN_FILE": "$env{VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "clang",
|
||||
"displayName": "Clang 20.1.8 x86_64-pc-linux-gnu",
|
||||
"description": "正在使用编译器: C = /usr/bin/clang, CXX = /usr/bin/clang++",
|
||||
"binaryDir": "${sourceDir}/out/build/${presetName}",
|
||||
"cacheVariables": {
|
||||
"CMAKE_INSTALL_PREFIX": "${sourceDir}/out/install/${presetName}",
|
||||
"CMAKE_C_COMPILER": "/usr/bin/clang",
|
||||
"CMAKE_CXX_COMPILER": "/usr/bin/clang++",
|
||||
"CMAKE_BUILD_TYPE": "Debug",
|
||||
"CMAKE_TOOLCHAIN_FILE": "$env{VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake"
|
||||
}
|
||||
}
|
||||
],
|
||||
"buildPresets": [
|
||||
|
||||
12
CMakeUserPresets.json
Normal file
12
CMakeUserPresets.json
Normal file
@@ -0,0 +1,12 @@
|
||||
{
|
||||
"version": 3,
|
||||
"configurePresets": [
|
||||
{
|
||||
"name": "clang-local",
|
||||
"inherits": "clang",
|
||||
"environment": {
|
||||
"VCPKG_ROOT": "/home/vcpkg"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -74,83 +74,19 @@ function(configure_compiler_options)
|
||||
message(STATUS "启用彩色诊断和完整模板回溯")
|
||||
endif()
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
# ================================================================================================
|
||||
# 配置SIMD优化(扩展AVX512支持)
|
||||
# ================================================================================================
|
||||
function(configure_simd_optimizations)
|
||||
if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64")
|
||||
# 检测编译器支持
|
||||
include(CheckCXXCompilerFlag)
|
||||
|
||||
# AVX2支持(保持现有)
|
||||
if(MSVC)
|
||||
check_cxx_compiler_flag("/arch:AVX2" COMPILER_SUPPORTS_AVX2)
|
||||
if(COMPILER_SUPPORTS_AVX2)
|
||||
add_compile_options(/arch:AVX2)
|
||||
add_compile_definitions(DAW_ENABLE_AVX2)
|
||||
message(STATUS "SIMD优化: 启用AVX2指令集")
|
||||
endif()
|
||||
else()
|
||||
check_cxx_compiler_flag("-mavx2" COMPILER_SUPPORTS_AVX2)
|
||||
if(COMPILER_SUPPORTS_AVX2)
|
||||
add_compile_options(-mavx2 -mfma)
|
||||
add_compile_definitions(DAW_ENABLE_AVX2)
|
||||
message(STATUS "SIMD优化: 启用AVX2指令集")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# AVX512支持(新增)
|
||||
if(MSVC)
|
||||
check_cxx_compiler_flag("/arch:AVX512" COMPILER_SUPPORTS_AVX512)
|
||||
if(COMPILER_SUPPORTS_AVX512)
|
||||
add_compile_options(/arch:AVX512)
|
||||
add_compile_definitions(DAW_ENABLE_AVX512)
|
||||
message(STATUS "SIMD优化: 启用AVX512指令集")
|
||||
else()
|
||||
message(WARNING "编译器不支持AVX512,降级到AVX2")
|
||||
endif()
|
||||
else()
|
||||
check_cxx_compiler_flag("-mavx512f" COMPILER_SUPPORTS_AVX512F)
|
||||
check_cxx_compiler_flag("-mavx512vl" COMPILER_SUPPORTS_AVX512VL)
|
||||
check_cxx_compiler_flag("-mavx512bw" COMPILER_SUPPORTS_AVX512BW)
|
||||
|
||||
if(COMPILER_SUPPORTS_AVX512F AND COMPILER_SUPPORTS_AVX512VL)
|
||||
add_compile_options(-mavx512f -mavx512vl)
|
||||
add_compile_definitions(DAW_ENABLE_AVX512)
|
||||
|
||||
if(COMPILER_SUPPORTS_AVX512BW)
|
||||
add_compile_options(-mavx512bw)
|
||||
add_compile_definitions(DAW_ENABLE_AVX512BW)
|
||||
endif()
|
||||
|
||||
message(STATUS "SIMD优化: 启用AVX512指令集")
|
||||
else()
|
||||
message(WARNING "编译器不支持完整AVX512,降级到AVX2")
|
||||
endif()
|
||||
endif()
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64|ARM64")
|
||||
# ARM平台:检测NEON支持
|
||||
include(CheckCXXCompilerFlag)
|
||||
|
||||
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|ARM64")
|
||||
# AArch64: NEON默认可用
|
||||
add_compile_definitions(DAW_ENABLE_NEON)
|
||||
message(STATUS "SIMD优化: 启用ARM64 NEON指令集")
|
||||
else()
|
||||
# ARM32: 检测NEON支持
|
||||
check_cxx_compiler_flag("-mfpu=neon" COMPILER_SUPPORTS_NEON)
|
||||
if(COMPILER_SUPPORTS_NEON)
|
||||
add_compile_options(-mfpu=neon)
|
||||
add_compile_definitions(DAW_ENABLE_NEON)
|
||||
message(STATUS "SIMD优化: 启用ARM32 NEON指令集")
|
||||
else()
|
||||
message(STATUS "SIMD优化: ARM32平台不支持NEON")
|
||||
endif()
|
||||
endif()
|
||||
else()
|
||||
message(STATUS "SIMD优化: 当前架构(${CMAKE_SYSTEM_PROCESSOR})不支持SIMD优化")
|
||||
if (MSVC)
|
||||
add_compile_definitions(ALICHO_MSVC=1)
|
||||
add_compile_definitions(ALICHO_GCC=0)
|
||||
add_compile_definitions(ALICHO_CLANG=0)
|
||||
elseif (CMAKE_CXX_COMPILER_ID MATCHES "GNU")
|
||||
add_compile_definitions(ALICHO_MSVC=0)
|
||||
add_compile_definitions(ALICHO_GCC=1)
|
||||
add_compile_definitions(ALICHO_CLANG=0)
|
||||
elseif (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
|
||||
add_compile_definitions(ALICHO_MSVC=0)
|
||||
add_compile_definitions(ALICHO_GCC=0)
|
||||
add_compile_definitions(ALICHO_CLANG=1)
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
@@ -159,6 +95,5 @@ endfunction()
|
||||
# ================================================================================================
|
||||
function(apply_compiler_configuration)
|
||||
configure_compiler_options()
|
||||
configure_simd_optimizations()
|
||||
message(STATUS "编译器配置完成")
|
||||
endfunction()
|
||||
@@ -142,7 +142,12 @@ function(add_os_definitions target)
|
||||
# --- 阶段 3: 应用所有定义 ---
|
||||
# **关键:使用一次调用将所有定义添加到目标**
|
||||
if(definitions_list)  # only touch the target when there is something to add
    # INTERFACE libraries accept only INTERFACE usage requirements, so the
    # visibility keyword is chosen from the target type. There is deliberately
    # no unconditional PUBLIC call before this check: it would be rejected for
    # INTERFACE libraries and would add the definitions twice otherwise.
    get_target_property(target_type ${target} TYPE)
    if("${target_type}" STREQUAL "INTERFACE_LIBRARY")
        target_compile_definitions(${target} INTERFACE ${definitions_list})
    else()
        target_compile_definitions(${target} PUBLIC ${definitions_list})
    endif()
endif()
|
||||
|
||||
# 函数作用域结束时,alicho_def_* 变量会自动销毁,无需显式 unset
|
||||
|
||||
@@ -567,8 +567,13 @@ function(simple_library library_type)
|
||||
set(source_files "")
|
||||
retrieve_files(${CMAKE_CURRENT_SOURCE_DIR} source_files)
|
||||
add_library(${PROJECT_NAME} ${library_type} ${source_files})
|
||||
# Attach the include directory and the shared project options exactly once.
# INTERFACE libraries accept only the INTERFACE keyword; every other library
# type gets PUBLIC so consumers inherit the requirements as well.
if("${library_type}" STREQUAL "INTERFACE")
    target_include_directories(${PROJECT_NAME} INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
    target_link_libraries(${PROJECT_NAME} INTERFACE audio_backend_project_options)
else()
    target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
    target_link_libraries(${PROJECT_NAME} PUBLIC audio_backend_project_options)
endif()
|
||||
message(STATUS "创建库目标: ${PROJECT_NAME},类型: ${library_type},引用路径: ${CMAKE_CURRENT_SOURCE_DIR}")
|
||||
add_os_definitions(${PROJECT_NAME})
|
||||
endfunction()
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -40,6 +40,7 @@
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
#include <atomic>
|
||||
|
||||
// 前向声明
|
||||
class audio_processing_task;
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
#include "lazy_singleton.h"
|
||||
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "plugin_type.h"
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
#include <cstdint>
|
||||
#include <span>
|
||||
#include <atomic>
|
||||
#include <vector>
|
||||
|
||||
#include "aligned_allocator.h"
|
||||
#include "transport/audio_processing_shm.h"
|
||||
|
||||
39
src/misc/lib_handle.h
Normal file
39
src/misc/lib_handle.h
Normal file
@@ -0,0 +1,39 @@
|
||||
#pragma once
|
||||
#include <filesystem>
#include <functional>
#include <string>
#include <type_traits>
|
||||
|
||||
/**
 * Owning wrapper around a dynamically loaded library.
 *
 * The platform-specific pieces (open / close / get_raw_function) are defined
 * out of line; this header only declares them and provides the typed lookup
 * helper. The destructor calls close(), so the object owns the handle it holds.
 *
 * Because the handle is owned, the type is move-only: copying would leave two
 * objects closing the same handle.
 */
class lib_handle {
public:
    lib_handle() = default;
    ~lib_handle() { close(); }

    // Copying would duplicate ownership of handle_ and close it twice.
    lib_handle(const lib_handle&) = delete;
    auto operator=(const lib_handle&) -> lib_handle& = delete;

    // Moving transfers ownership and leaves the source empty.
    lib_handle(lib_handle&& other) noexcept : handle_(other.handle_) {
        other.handle_ = nullptr;
    }

    auto operator=(lib_handle&& other) noexcept -> lib_handle& {
        if (this != &other) {
            close();  // release whatever this object currently holds
            handle_ = other.handle_;
            other.handle_ = nullptr;
        }
        return *this;
    }

    /// Loads the library at lib_path; returns true on success.
    auto open(const std::filesystem::path& lib_path) -> bool;

    /// Releases the currently held library, if any.
    void close();

    /**
     * Looks up a symbol by name and wraps it in a std::function.
     *
     * @tparam Func      Function signature, e.g. int(float, double).
     * @param  func_name Symbol name passed to the platform lookup.
     * @return A callable wrapping the symbol, or an empty std::function when
     *         the lookup returns null (test it with `if (fn)` before calling).
     *
     * NOTE(review): the returned callable holds a plain function pointer into
     * the library; it presumably must not be invoked after close() or after
     * this object is destroyed — confirm against callers.
     */
    template<typename Func>
    auto get_function_by_name(const std::string& func_name) -> std::function<Func> {
        auto raw_func_ptr = get_raw_function(func_name);
        if (!raw_func_ptr) {
            return nullptr;
        }

        // Func is a signature such as int(float, double); Func* is the
        // matching function-pointer type, int(*)(float, double).
        using func_ptr_type = Func*;
        auto typed_func_ptr = reinterpret_cast<func_ptr_type>(raw_func_ptr);

        return std::function<Func>(typed_func_ptr);
    }

private:
    /// Platform-specific untyped symbol lookup; returns nullptr on failure.
    auto get_raw_function(const std::string& func_name) -> void*;

    void* handle_{nullptr};
};
|
||||
|
||||
// 通过函数签名自动推导类型并获取函数
|
||||
// 用法示例:auto func = get_function_by_func_signature(handle, my_function);
|
||||
// 其中 my_function 是实际的函数名称
|
||||
#define get_function_by_func_signature(lib_handle_instance, func_signature) \
|
||||
(lib_handle_instance).get_function_by_name<std::remove_pointer_t<decltype(&func_signature)>>(#func_signature)
|
||||
|
||||
24
src/misc/linux/lib_handle.cpp
Normal file
24
src/misc/linux/lib_handle.cpp
Normal file
@@ -0,0 +1,24 @@
|
||||
#include "lib_handle.h"
|
||||
|
||||
#include <dlfcn.h>
|
||||
|
||||
// Loads the shared object at lib_path with lazy symbol binding.
// Any library already held by this object is released first.
// Returns true when dlopen produced a non-null handle.
auto lib_handle::open(const std::filesystem::path& lib_path) -> bool {
    close();

    void* const loaded = dlopen(lib_path.c_str(), RTLD_LAZY);
    handle_ = loaded;
    return loaded != nullptr;
}
|
||||
|
||||
void lib_handle::close() {
|
||||
if (handle_) {
|
||||
dlclose(handle_);
|
||||
handle_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
auto lib_handle::get_raw_function(const std::string& func_name) -> void* {
|
||||
if (!handle_) {
|
||||
return nullptr;
|
||||
}
|
||||
return dlsym(handle_, func_name.c_str());
|
||||
}
|
||||
@@ -16,41 +16,14 @@
|
||||
|
||||
#include "thread_tool.h"
|
||||
|
||||
#include <pthread.h>
|
||||
#include <sched.h>
|
||||
#include <cstring>
|
||||
#include <cerrno>
|
||||
#include "logger.h"
|
||||
|
||||
/**
|
||||
* @brief 设置线程CPU亲和性(Linux占位实现)
|
||||
*
|
||||
* 当前返回false表示功能未实现。
|
||||
*
|
||||
* ## 计划实现
|
||||
*
|
||||
* 完整实现应该使用pthread_setaffinity_np或sched_setaffinity:
|
||||
*
|
||||
* @code
|
||||
* #include <pthread.h>
|
||||
* #include <sched.h>
|
||||
*
|
||||
* bool thread_set_affinity(boost::thread& thread, int core_id) {
|
||||
* cpu_set_t cpuset;
|
||||
* CPU_ZERO(&cpuset); // 清空CPU集合
|
||||
* CPU_SET(core_id, &cpuset); // 设置指定的CPU核心
|
||||
*
|
||||
* // 使用pthread API设置亲和性
|
||||
* int result = pthread_setaffinity_np(
|
||||
* thread.native_handle(), // pthread线程句柄
|
||||
* sizeof(cpu_set_t), // CPU集合大小
|
||||
* &cpuset // CPU集合指针
|
||||
* );
|
||||
*
|
||||
* if (result != 0) {
|
||||
* log_module_error(THREAD_TOOL_LOG_MODULE,
|
||||
* "无法设置线程亲和性到核心{}: {}",
|
||||
* core_id, strerror(result));
|
||||
* return false;
|
||||
* }
|
||||
* return true;
|
||||
* }
|
||||
* @endcode
|
||||
*
|
||||
* ### API说明
|
||||
* - cpu_set_t:CPU集合类型,表示一组CPU核心
|
||||
* - CPU_ZERO:清空CPU集合的所有位
|
||||
@@ -70,47 +43,30 @@
|
||||
* @todo 实现实际的线程亲和性设置功能
|
||||
*/
|
||||
// Pins the given thread to a single CPU core.
//
// @param thread  Thread whose native pthread handle is used.
// @param core_id Zero-based CPU index; must be in [0, CPU_SETSIZE).
// @return true when pthread_setaffinity_np succeeds; false (after logging)
//         when core_id is out of range or the call reports an error.
bool thread_set_affinity(boost::thread& thread, int core_id) {
    // A cpu_set_t can only describe CPUs below CPU_SETSIZE; report an
    // out-of-range index explicitly instead of relying on the later call.
    if (core_id < 0 || core_id >= CPU_SETSIZE) {
        log_module_error(THREAD_TOOL_LOG_MODULE,
                         "无法将线程亲和性设置为核心{}: {}",
                         core_id, strerror(EINVAL));
        return false;
    }

    // Build a CPU set that contains only the requested core.
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    CPU_SET(core_id, &cpuset);

    // pthread_setaffinity_np returns the error number directly (it does not
    // set errno), so the result is what gets passed to strerror.
    const int result = pthread_setaffinity_np(
        thread.native_handle(),
        sizeof(cpu_set_t),
        &cpuset
    );

    if (result != 0) {
        log_module_error(THREAD_TOOL_LOG_MODULE,
                         "无法将线程亲和性设置为核心{}: {}",
                         core_id, strerror(result));
        return false;
    }

    return true;
}
|
||||
|
||||
/**
|
||||
* @brief 设置线程名称(Linux占位实现)
|
||||
*
|
||||
* 当前返回false表示功能未实现。
|
||||
*
|
||||
* ## 计划实现
|
||||
*
|
||||
* 完整实现应该使用pthread_setname_np:
|
||||
*
|
||||
* @code
|
||||
* #include <pthread.h>
|
||||
* #include <cstring>
|
||||
*
|
||||
* bool thread_set_name(boost::thread& thread, const char* name) {
|
||||
* // Linux限制线程名称最长为15个字符(不含null终止符)
|
||||
* // 因此需要截断过长的名称
|
||||
* char truncated_name[16]; // 15字符 + null终止符
|
||||
* strncpy(truncated_name, name, 15);
|
||||
* truncated_name[15] = '\0';
|
||||
*
|
||||
* // 使用pthread API设置线程名称
|
||||
* int result = pthread_setname_np(
|
||||
* thread.native_handle(), // pthread线程句柄
|
||||
* truncated_name // 线程名称(最长15字符)
|
||||
* );
|
||||
*
|
||||
* if (result != 0) {
|
||||
* log_module_error(THREAD_TOOL_LOG_MODULE,
|
||||
* "无法设置线程名称为 {}: {}",
|
||||
* name, strerror(result));
|
||||
* return false;
|
||||
* }
|
||||
* return true;
|
||||
* }
|
||||
* @endcode
|
||||
*
|
||||
* ### API说明
|
||||
* - pthread_setname_np:Linux特定的线程命名API
|
||||
* - 线程名称限制为15个字符(不包括null终止符)
|
||||
@@ -138,8 +94,24 @@ bool thread_set_affinity(boost::thread& thread, int core_id) {
|
||||
* @todo 添加名称长度检查和截断逻辑
|
||||
*/
|
||||
// Sets the name of the given thread via pthread_setname_np.
//
// @param thread Thread whose native pthread handle is used.
// @param name   NUL-terminated name; only the first 15 characters are used.
// @return true on success; false (after logging) when name is null or the
//         pthread call reports an error.
bool thread_set_name(boost::thread& thread, const char* name) {
    // strncpy on a null pointer is undefined behaviour, so reject it up front.
    if (name == nullptr) {
        log_module_error(THREAD_TOOL_LOG_MODULE,
                         "无法设置线程名称为 {}: {}",
                         "(null)", strerror(EINVAL));
        return false;
    }

    // Linux limits thread names to 15 characters plus the terminating NUL,
    // so longer names are truncated. The buffer is zero-initialised, which
    // guarantees termination even when strncpy copies all 15 characters.
    char truncated_name[16] = {};
    strncpy(truncated_name, name, 15);

    // pthread_setname_np returns the error number directly.
    const int result = pthread_setname_np(
        thread.native_handle(),
        truncated_name
    );

    if (result != 0) {
        log_module_error(THREAD_TOOL_LOG_MODULE,
                         "无法设置线程名称为 {}: {}",
                         name, strerror(result));
        return false;
    }

    return true;
}
|
||||
|
||||
203
src/misc/macos/lib_handle.cpp
Normal file
203
src/misc/macos/lib_handle.cpp
Normal file
@@ -0,0 +1,203 @@
|
||||
/**
|
||||
* @file lib_handle.cpp
|
||||
* @brief macOS平台动态库加载工具实现
|
||||
*
|
||||
* 实现了lib_handle.h中声明的跨平台动态库加载工具的macOS版本。
|
||||
* 使用POSIX标准的dlopen/dlclose/dlsym API来实现动态库的加载、卸载和函数查找功能。
|
||||
* 这些API在macOS上通过dyld(动态链接器)实现。
|
||||
*
|
||||
* ## macOS动态库说明
|
||||
* - dylib:macOS标准动态库格式(类似Linux的.so)
|
||||
* - framework:macOS特有的打包格式,包含库、头文件和资源
|
||||
* - bundle:可加载的插件格式(.bundle或.plugin)
|
||||
*
|
||||
* ## POSIX API说明
|
||||
* - dlopen:加载动态库
|
||||
* - dlclose:卸载动态库
|
||||
* - dlsym:从动态库中获取符号地址
|
||||
* - dlerror:获取最后一次错误信息
|
||||
*
|
||||
* @note 使用.cpp扩展名(不是.mm),因为不需要Objective-C功能
|
||||
* @note 与Linux实现基本相同,但加载路径和搜索规则有所不同
|
||||
*/
|
||||
|
||||
#include "lib_handle.h"
|
||||
|
||||
#include <dlfcn.h>
|
||||
|
||||
/**
|
||||
* @brief 打开动态库(macOS实现)
|
||||
*
|
||||
* 使用POSIX标准的dlopen加载动态库(.dylib、.framework或.bundle)。
|
||||
*
|
||||
* ## 实现细节
|
||||
*
|
||||
* ### macOS动态库类型
|
||||
* 1. **dylib** - 标准动态库
|
||||
* - 扩展名:.dylib
|
||||
* - 位置:/usr/lib、/usr/local/lib等
|
||||
* - 示例:libMyLib.dylib
|
||||
*
|
||||
* 2. **Framework** - macOS特有格式
|
||||
* - 位置:/System/Library/Frameworks、/Library/Frameworks
|
||||
* - 结构:MyFramework.framework/MyFramework
|
||||
* - 包含:库、头文件、资源
|
||||
*
|
||||
* 3. **Bundle** - 可加载插件
|
||||
* - 扩展名:.bundle、.plugin
|
||||
* - 常用于插件系统
|
||||
*
|
||||
* ### dlopen标志说明
|
||||
* - RTLD_LAZY:延迟解析符号(性能更好)
|
||||
* - 仅在符号首次使用时解析
|
||||
* - 如果符号不存在,会在使用时才报错
|
||||
*
|
||||
* - RTLD_NOW:立即解析所有符号
|
||||
* - 加载时解析所有符号
|
||||
* - 如果有未定义符号,dlopen会失败
|
||||
*
|
||||
* - RTLD_LOCAL:符号仅在本库内可见(默认)
|
||||
* - RTLD_GLOBAL:符号对后续加载的库可见
|
||||
*
|
||||
* ### macOS搜索路径
|
||||
* dlopen按以下顺序搜索:
|
||||
* 1. @executable_path - 可执行文件所在目录
|
||||
* 2. @loader_path - 加载库所在目录
|
||||
* 3. @rpath - 运行时搜索路径
|
||||
* 4. DYLD_LIBRARY_PATH环境变量(如果设置)
|
||||
* 5. /usr/local/lib
|
||||
* 6. /usr/lib
|
||||
*
|
||||
* ### 系统完整性保护(SIP)
|
||||
* macOS 10.11+启用了SIP,限制:
|
||||
* - DYLD_LIBRARY_PATH在受保护进程中被忽略
|
||||
* - 无法修改系统库路径
|
||||
* - 某些目录需要特殊权限
|
||||
*
|
||||
* ### 错误处理
|
||||
* dlopen失败时返回nullptr,使用dlerror()获取错误信息:
|
||||
* - "image not found" - 库文件不存在
|
||||
* - "no suitable image found" - 架构不匹配
|
||||
* - "symbol not found" - 缺少符号(RTLD_NOW模式)
|
||||
* - "Library not loaded" - 缺少依赖库
|
||||
*
|
||||
* @param lib_path 动态库路径(支持相对路径、绝对路径、@rpath等)
|
||||
* @return bool true表示成功,false表示失败
|
||||
*
|
||||
* @note 会先调用close()关闭已打开的库
|
||||
* @note 使用RTLD_LAZY以获得更好的性能
|
||||
* @note 失败时可以调用dlerror()获取详细错误信息
|
||||
*/
|
||||
// Loads the library at lib_path through dlopen with lazy symbol binding.
// A previously opened library is closed first. Returns true on success.
auto lib_handle::open(const std::filesystem::path& lib_path) -> bool {
    close();

    void* const loaded = dlopen(lib_path.c_str(), RTLD_LAZY);
    handle_ = loaded;
    return loaded != nullptr;
}
|
||||
|
||||
/**
|
||||
* @brief 关闭动态库(macOS实现)
|
||||
*
|
||||
* 使用POSIX标准的dlclose卸载动态库。
|
||||
*
|
||||
* ## 实现细节
|
||||
*
|
||||
* ### 引用计数
|
||||
* - dlclose递减库的引用计数
|
||||
* - 当引用计数降为0时,库才会真正卸载
|
||||
* - 如果库被多次dlopen,需要相同次数的dlclose
|
||||
*
|
||||
* ### 析构函数
|
||||
* 库卸载时会调用:
|
||||
* - C++全局对象的析构函数
|
||||
* - __attribute__((destructor))标记的函数
|
||||
* - atexit()注册的清理函数
|
||||
*
|
||||
* ### macOS特性
|
||||
* - 某些系统库可能无法卸载(返回错误但不影响程序)
|
||||
* - Framework的卸载也会卸载其资源和依赖
|
||||
* - 卸载时dyld会处理依赖关系
|
||||
*
|
||||
* ### 线程安全
|
||||
* - dlclose是线程安全的
|
||||
* - 但需要确保没有线程正在使用库中的代码
|
||||
* - 正在执行的函数可能导致崩溃
|
||||
*
|
||||
* @note 调用后handle_会被设置为nullptr
|
||||
* @note 重复调用是安全的(会检查handle_是否为空)
|
||||
* @note 卸载失败时dlerror()会返回错误信息
|
||||
*/
|
||||
void lib_handle::close() {
|
||||
if (handle_) {
|
||||
dlclose(handle_);
|
||||
handle_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief 获取函数地址(macOS实现)
|
||||
*
|
||||
* 使用POSIX标准的dlsym从动态库中获取符号地址。
|
||||
*
|
||||
* ## 实现细节
|
||||
*
|
||||
* ### 符号查找
|
||||
* dlsym可以查找:
|
||||
* - C函数:直接使用函数名
|
||||
* - C++函数:需要使用extern "C"避免名称修饰
|
||||
* - 全局变量:可以获取变量地址
|
||||
* - 弱符号:如果存在返回地址,否则返回nullptr
|
||||
*
|
||||
* ### 名称修饰(Name Mangling)
|
||||
* C++编译器会修饰函数名以支持重载:
|
||||
* @code
|
||||
* // C++函数
|
||||
* int add(int a, int b); // 可能被修饰为 __Z3addii
|
||||
*
|
||||
* // 避免修饰
|
||||
* extern "C" int add(int a, int b); // 保持为 add
|
||||
* @endcode
|
||||
*
|
||||
* ### macOS符号约定
|
||||
* - 前导下划线:C符号通常有前导下划线(但dlsym会自动处理)
|
||||
* - 隐藏符号:使用__attribute__((visibility("hidden")))的符号无法查找
|
||||
* - 弱符号:使用__attribute__((weak))的符号可以被覆盖
|
||||
*
|
||||
* ### 错误处理
|
||||
* dlsym失败返回nullptr,常见原因:
|
||||
* - 符号不存在
|
||||
* - 符号被标记为隐藏
|
||||
* - 名称修饰不匹配
|
||||
* - 库未正确导出符号
|
||||
*
|
||||
* ### 使用示例
|
||||
* @code
|
||||
* // 假设库中有:extern "C" int calculate(int x);
|
||||
* lib_handle lib;
|
||||
* lib.open("libmath.dylib");
|
||||
*
|
||||
* // 获取函数指针
|
||||
* auto func = lib.get_function_by_name<int(int)>("calculate");
|
||||
* if (func) {
|
||||
* int result = func(42);
|
||||
* }
|
||||
* @endcode
|
||||
*
|
||||
* @param func_name 符号名称(C风格字符串)
|
||||
* @return void* 符号地址指针,失败时返回nullptr
|
||||
*
|
||||
* @note 如果handle_为空,直接返回nullptr
|
||||
* @note 返回的指针需要转换为正确的类型才能使用
|
||||
* @note 可以使用dlerror()获取失败原因
|
||||
*/
|
||||
auto lib_handle::get_raw_function(const std::string& func_name) -> void* {
|
||||
if (!handle_) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// dlsym返回符号地址(函数或变量)
|
||||
// 在macOS上,dlsym会自动处理前导下划线
|
||||
return dlsym(handle_, func_name.c_str());
|
||||
}
|
||||
130
src/misc/windows/lib_handle.cpp
Normal file
130
src/misc/windows/lib_handle.cpp
Normal file
@@ -0,0 +1,130 @@
|
||||
/**
|
||||
* @file lib_handle.cpp
|
||||
* @brief Windows平台动态库加载工具实现
|
||||
*
|
||||
* 实现了lib_handle.h中声明的跨平台动态库加载工具的Windows版本。
|
||||
* 使用Windows API(LoadLibrary、FreeLibrary、GetProcAddress)来实现
|
||||
* 动态库的加载、卸载和函数查找功能。
|
||||
*
|
||||
* ## Windows API说明
|
||||
* - LoadLibrary:加载动态链接库(DLL)
|
||||
* - FreeLibrary:卸载动态链接库
|
||||
* - GetProcAddress:从DLL中获取函数地址
|
||||
*
|
||||
* @note 仅在Windows平台编译
|
||||
*/
|
||||
|
||||
#include "lib_handle.h"
|
||||
|
||||
#include <windows.h>
|
||||
|
||||
/**
|
||||
* @brief 打开动态库(Windows实现)
|
||||
*
|
||||
* 使用Windows API LoadLibrary加载指定路径的DLL文件。
|
||||
*
|
||||
* ## 实现细节
|
||||
*
|
||||
* ### LoadLibrary行为
|
||||
* - 如果DLL已经加载,会增加其引用计数
|
||||
* - 搜索顺序:
|
||||
* 1. 应用程序目录
|
||||
* 2. 系统目录(System32)
|
||||
* 3. Windows目录
|
||||
* 4. 当前目录
|
||||
* 5. PATH环境变量中的目录
|
||||
*
|
||||
* ### 错误处理
|
||||
* LoadLibrary失败时返回NULL,可能的原因:
|
||||
* - 文件不存在
|
||||
* - 不是有效的DLL文件
|
||||
* - 缺少依赖的DLL
|
||||
* - 架构不匹配(32位/64位)
|
||||
* - 权限不足
|
||||
*
|
||||
* @param lib_path DLL文件的路径
|
||||
* @return bool true表示成功,false表示失败
|
||||
*
|
||||
* @note 会先调用close()关闭已打开的库
|
||||
* @note 失败时可以使用GetLastError()获取详细错误代码
|
||||
*/
|
||||
auto lib_handle::open(const std::filesystem::path& lib_path) -> bool {
|
||||
close();
|
||||
|
||||
// 使用LoadLibrary加载DLL
|
||||
// lib_path.c_str()返回const char*,在Windows上会自动转换为所需的类型
|
||||
handle_ = LoadLibraryA(lib_path.string().c_str());
|
||||
return handle_ != nullptr;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief 关闭动态库(Windows实现)
|
||||
*
|
||||
* 使用Windows API FreeLibrary卸载DLL。
|
||||
*
|
||||
* ## 实现细节
|
||||
*
|
||||
* ### 引用计数
|
||||
* - FreeLibrary递减DLL的引用计数
|
||||
* - 当引用计数降为0时,DLL才会真正卸载
|
||||
* - 如果DLL被多次LoadLibrary,需要相同次数的FreeLibrary
|
||||
*
|
||||
* ### 线程安全
|
||||
* - DLL的DllMain函数会在卸载时被调用(DLL_PROCESS_DETACH)
|
||||
* - 需要确保没有其他线程正在使用DLL中的代码或数据
|
||||
*
|
||||
* @note 调用后handle_会被设置为nullptr
|
||||
* @note 重复调用是安全的(会检查handle_是否为空)
|
||||
*/
|
||||
void lib_handle::close() {
|
||||
if (handle_) {
|
||||
FreeLibrary(static_cast<HMODULE>(handle_));
|
||||
handle_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief 获取函数地址(Windows实现)
|
||||
*
|
||||
* 使用Windows API GetProcAddress从DLL中获取导出函数的地址。
|
||||
*
|
||||
* ## 实现细节
|
||||
*
|
||||
* ### 函数查找
|
||||
* GetProcAddress通过函数名查找:
|
||||
* - C函数:直接使用函数名
|
||||
* - C++函数:需要使用extern "C"避免名称修饰(name mangling)
|
||||
* - 导出序号:也可以使用序号而不是名称(不推荐)
|
||||
*
|
||||
* ### 名称修饰
|
||||
* C++编译器会对函数名进行修饰,导致查找失败。解决方法:
|
||||
* @code
|
||||
* // 在DLL中声明函数时使用
|
||||
* extern "C" __declspec(dllexport) int my_function(int x);
|
||||
* @endcode
|
||||
*
|
||||
* ### 调用约定
|
||||
* 需要确保函数的调用约定匹配:
|
||||
* - __cdecl:C默认调用约定
|
||||
* - __stdcall:Windows API调用约定
|
||||
* - __fastcall:快速调用约定
|
||||
*
|
||||
* @param func_name 函数名称(C风格字符串)
|
||||
* @return void* 函数地址指针,失败时返回nullptr
|
||||
*
|
||||
* @note 如果handle_为空,直接返回nullptr
|
||||
* @note 返回的指针需要转换为正确的函数指针类型才能调用
|
||||
* @note 使用FARPROC类型表示函数指针,然后转换为void*
|
||||
*/
|
||||
auto lib_handle::get_raw_function(const std::string& func_name) -> void* {
|
||||
if (!handle_) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// GetProcAddress返回FARPROC类型(函数指针)
|
||||
// FARPROC是Windows定义的通用函数指针类型
|
||||
// 将其转换为void*以保持跨平台的接口一致性
|
||||
return reinterpret_cast<void*>(
|
||||
GetProcAddress(static_cast<HMODULE>(handle_), func_name.c_str())
|
||||
);
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
project(alicho_network)
|
||||
|
||||
find_package(Boost COMPONENTS interprocess date_time thread CONFIG REQUIRED)
|
||||
find_package(zeromq REQUIRED)
|
||||
find_package(cppzmq CONFIG REQUIRED)
|
||||
find_package(cppzmq REQUIRED)
|
||||
find_package(yalantinglibs CONFIG REQUIRED)
|
||||
|
||||
|
||||
@@ -88,7 +88,7 @@ namespace alicho {
|
||||
|
||||
process_error process_monitor::update_status() {
|
||||
try {
|
||||
auto previous_state = monitored_process_.state;
|
||||
// auto previous_state = monitored_process_.state;
|
||||
|
||||
// 检查进程状态
|
||||
bool is_running = check_process_running();
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
project(alicho_simd)
|
||||
|
||||
simple_library(STATIC)
|
||||
target_link_libraries(${PROJECT_NAME} PUBLIC alicho_misc)
|
||||
add_subdirectory(simd_interface)
|
||||
add_subdirectory(simd_scaler)
|
||||
add_subdirectory(simd_sse)
|
||||
add_subdirectory(simd_avx)
|
||||
add_subdirectory(simd_avx512)
|
||||
add_subdirectory(misc)
|
||||
|
||||
@@ -1,158 +0,0 @@
|
||||
/**
|
||||
* @file simd_audio_processing.cpp
|
||||
* @brief SIMD音频处理函数注册模块实现
|
||||
*
|
||||
* 本文件负责将所有音频处理函数(标量和SIMD优化版本)注册到SIMD函数调度器中。
|
||||
* 注册过程采用分层策略:
|
||||
* 1. 根据平台(x86或ARM)选择合适的SIMD实现
|
||||
* 2. 为每个SIMD指令集版本注册对应的函数实现
|
||||
* 3. 运行时根据CPU特性自动选择最优实现
|
||||
*
|
||||
* 注册流程:
|
||||
* - 标量实现:所有平台通用的基准实现
|
||||
* - x86平台:SSE/SSE3/SSE4/AVX/AVX2/AVX512优化实现
|
||||
* - ARM平台:NEON优化实现
|
||||
*/
|
||||
|
||||
#include "simd_audio_processing.h"
|
||||
|
||||
#include "scalar_audio_processing_func.h"
|
||||
#include "simd_func_dispatcher.h"
|
||||
#include "x86_simd_audio_processing_func.h"
|
||||
#include "arm_simd_audio_processing_func.h"
|
||||
|
||||
/**
|
||||
* @brief x86平台自动注册宏 - 注册所有x86 SIMD版本
|
||||
*
|
||||
* 此宏为指定函数注册多个SIMD实现版本:
|
||||
* - SCALAR: 标量实现(所有平台通用的基准版本)
|
||||
* - SSE/SSE3/SSE4: 使用相同的SSE实现(128位向量,处理4个float)
|
||||
* - AVX/AVX2: 使用相同的AVX实现(256位向量,处理8个float)
|
||||
* - AVX512: 最新的AVX-512实现(512位向量,处理16个float)
|
||||
*
|
||||
* 注册顺序说明:
|
||||
* 1. 先注册标量版本作为后备实现
|
||||
* 2. 按指令集从旧到新注册SIMD版本
|
||||
* 3. 运行时调度器会根据CPU特性选择最优版本
|
||||
*
|
||||
* 注意:SSE/SSE3/SSE4共用同一实现是因为这些指令集间差异不影响音频处理性能
|
||||
*/
|
||||
#if ALICHO_PLATFORM_X86
|
||||
#define AUTO_REGISTER_SIMD_FUNCTION(func_name)\
|
||||
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::SCALAR, scalar_audio_processing_func::func_name); \
|
||||
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::SSE, x86_simd_audio_processing_func::func_name##_sse); \
|
||||
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::SSE3, x86_simd_audio_processing_func::func_name##_sse); \
|
||||
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::SSE4, x86_simd_audio_processing_func::func_name##_sse); \
|
||||
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::AVX, x86_simd_audio_processing_func::func_name##_avx); \
|
||||
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::AVX2, x86_simd_audio_processing_func::func_name##_avx); \
|
||||
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::AVX512, x86_simd_audio_processing_func::func_name##_avx512);
|
||||
|
||||
/**
|
||||
* @brief ARM平台自动注册宏 - 注册标量和NEON版本
|
||||
*
|
||||
* ARM平台注册策略:
|
||||
* - SCALAR: 标量实现(所有平台通用)
|
||||
* - NEON: ARM的SIMD指令集实现(128位向量,处理4个float)
|
||||
*
|
||||
* 注册顺序说明:
|
||||
* 1. 先注册标量版本作为后备实现
|
||||
* 2. 注册NEON优化版本(大多数现代ARM处理器都支持)
|
||||
* 3. 运行时根据CPU是否支持NEON自动选择
|
||||
*/
|
||||
#elif ALICHO_PLATFORM_ARM
|
||||
#define AUTO_REGISTER_SIMD_FUNCTION(func_name) \
|
||||
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::SCALAR, scalar_audio_processing_func::func_name); \
|
||||
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::NEON, neon_simd_audio_processing_func::func_name##_neon);
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @brief 强制使用标量实现的注册宏
|
||||
*
|
||||
* 某些函数可能由于以下原因只使用标量实现:
|
||||
* 1. 算法特性不适合SIMD优化(如分支过多)
|
||||
* 2. SIMD实现收益不明显甚至可能降低性能
|
||||
* 3. 实现复杂度过高,维护成本超过性能收益
|
||||
*
|
||||
* 此宏将标量实现注册到所有SIMD版本槽位,确保:
|
||||
* - 无论CPU支持何种指令集,都使用相同的标量实现
|
||||
* - 避免因缺少SIMD实现导致的运行时错误
|
||||
* - 保持API一致性,调用方无需关心实现细节
|
||||
*
|
||||
* 当前使用此宏的函数:
|
||||
* - apply_gain: 虽然可以SIMD优化,但此处暂时使用标量版本
|
||||
*/
|
||||
#define FORCE_SCALAR_SIMD_FUNCTION(func_name) \
|
||||
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::SCALAR, scalar_audio_processing_func::func_name); \
|
||||
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::SSE, scalar_audio_processing_func::func_name); \
|
||||
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::SSE3, scalar_audio_processing_func::func_name); \
|
||||
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::SSE4, scalar_audio_processing_func::func_name); \
|
||||
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::AVX, scalar_audio_processing_func::func_name); \
|
||||
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::AVX2, scalar_audio_processing_func::func_name); \
|
||||
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::AVX512, scalar_audio_processing_func::func_name); \
|
||||
REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::NEON, scalar_audio_processing_func::func_name);
|
||||
|
||||
/**
|
||||
* @brief 注册所有音频处理函数到SIMD调度器
|
||||
*
|
||||
* 此函数在程序初始化时调用,负责:
|
||||
* 1. 将所有音频处理函数的各个SIMD版本注册到调度器
|
||||
* 2. 建立函数名到实现的映射关系
|
||||
* 3. 为运行时动态分发做准备
|
||||
*
|
||||
* 注册的函数列表:
|
||||
* - mix_audio: 音频混合(两路音频相加)
|
||||
* - apply_gain: 音量增益调节(暂时使用标量版本)
|
||||
* - calculate_rms: RMS电平计算(均方根值)
|
||||
* - calculate_peak: 峰值电平检测
|
||||
* - normalize_audio: 音频归一化处理
|
||||
* - stereo_to_mono: 立体声转单声道
|
||||
* - limit_audio: 音频限幅器(动态范围压缩)
|
||||
* - fade_audio: 淡入淡出效果
|
||||
* - simple_eq: 简单三段均衡器(低频/中频/高频)
|
||||
*
|
||||
* 注册顺序考虑:
|
||||
* - 按功能类型分组:基础操作 -> 分析 -> 效果处理
|
||||
* - 简单函数在前,复杂函数在后
|
||||
* - 便于理解和维护
|
||||
*
|
||||
* 错误处理:
|
||||
* - REGISTER_SIMD_FUNCTION宏内部会处理重复注册
|
||||
* - 如果函数不存在会在编译时报错(类型安全)
|
||||
*/
|
||||
void audio_processing_registry::register_all_functions() {
|
||||
AUTO_REGISTER_SIMD_FUNCTION(mix_audio); // 音频混合
|
||||
FORCE_SCALAR_SIMD_FUNCTION(apply_gain); // 增益调节(标量版本)
|
||||
AUTO_REGISTER_SIMD_FUNCTION(calculate_rms); // RMS计算
|
||||
AUTO_REGISTER_SIMD_FUNCTION(calculate_peak); // 峰值检测
|
||||
AUTO_REGISTER_SIMD_FUNCTION(normalize_audio); // 归一化
|
||||
AUTO_REGISTER_SIMD_FUNCTION(stereo_to_mono); // 立体声转单声道
|
||||
AUTO_REGISTER_SIMD_FUNCTION(limit_audio); // 限幅器
|
||||
AUTO_REGISTER_SIMD_FUNCTION(fade_audio); // 淡入淡出
|
||||
AUTO_REGISTER_SIMD_FUNCTION(simple_eq); // 简单均衡器
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief 打印所有已注册函数的状态信息
|
||||
*
|
||||
* 此函数用于调试和诊断,输出内容包括:
|
||||
* 1. 已注册的函数名称列表
|
||||
* 2. 每个函数可用的SIMD版本
|
||||
* 3. 当前CPU支持的指令集
|
||||
* 4. 运行时将使用的具体实现版本
|
||||
*
|
||||
* 使用场景:
|
||||
* - 开发调试:验证函数是否正确注册
|
||||
* - 性能分析:确认使用了最优的SIMD版本
|
||||
* - 问题诊断:检查CPU特性检测是否正常
|
||||
* - 用户支持:提供系统配置信息
|
||||
*
|
||||
* 输出示例:
|
||||
* "Function: mix_audio
|
||||
* - SCALAR: available
|
||||
* - SSE: available
|
||||
* - AVX: available (selected)
|
||||
* - AVX512: not available (CPU not supported)"
|
||||
*/
|
||||
void audio_processing_registry::print_available_functions() {
|
||||
simd_func_dispatcher::instance().print_registry_status();
|
||||
}
|
||||
@@ -1,146 +0,0 @@
|
||||
/**
|
||||
* @file simd_audio_processing.h
|
||||
* @brief SIMD音频处理函数注册器 - 音频处理功能的统一注册和管理入口
|
||||
*
|
||||
* 本文件定义了音频处理函数的注册器类,负责将所有音频处理函数
|
||||
* (包括标量版本和各种SIMD优化版本)注册到函数分发器中。
|
||||
*
|
||||
* 核心职责:
|
||||
* 1. **函数注册**:将所有音频处理函数注册到SIMD函数分发器
|
||||
* 2. **平台适配**:根据CPU特性选择合适的SIMD实现
|
||||
* 3. **性能优化**:确保运行时使用最优的函数实现
|
||||
* 4. **调试支持**:提供可用函数列表打印功能
|
||||
*
|
||||
* 与函数分发器的关系:
|
||||
* ```
|
||||
* simd_func_dispatcher (分发器)
|
||||
* ↑
|
||||
* | 注册
|
||||
* |
|
||||
* audio_processing_registry (本文件)
|
||||
* |
|
||||
* | 包含
|
||||
* ↓
|
||||
* 标量版本 + SSE + AVX + AVX512 + NEON (实现)
|
||||
* ```
|
||||
*
|
||||
* 支持的音频处理功能:
|
||||
* - 音频混合 (mix_audio)
|
||||
* - 增益控制 (apply_gain)
|
||||
* - RMS计算 (calculate_rms)
|
||||
* - 峰值检测 (calculate_peak)
|
||||
* - 音频归一化 (normalize_audio)
|
||||
* - 立体声转单声道 (stereo_to_mono)
|
||||
* - 音频限幅 (limit_audio)
|
||||
* - 淡入淡出 (fade_audio)
|
||||
* - 三段均衡器 (simple_eq)
|
||||
*
|
||||
* 使用方式:
|
||||
* ```cpp
|
||||
* // 在程序启动时调用一次
|
||||
* audio_processing_registry::register_all_functions();
|
||||
*
|
||||
* // 调试时查看可用函数
|
||||
* audio_processing_registry::print_available_functions();
|
||||
*
|
||||
* // 之后通过函数分发器使用
|
||||
* auto func = simd_func_dispatcher::get_function<...>("mix_audio");
|
||||
* ```
|
||||
*
|
||||
* @note 此类只包含静态方法,不需要实例化
|
||||
* @see simd_func_dispatcher.h 函数分发器的定义
|
||||
* @see scalar_audio_processing_func.h 标量实现
|
||||
* @see x86_simd_audio_processing_func.h x86 SIMD实现
|
||||
* @see arm_simd_audio_processing_func.h ARM NEON实现
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
/**
 * @class audio_processing_registry
 * @brief Registrar for the audio processing functions.
 *
 * Registers all audio processing functions with the SIMD function dispatcher.
 * Registration is described as picking the best implementation for the
 * features of the current CPU.
 *
 * Registration order and priority:
 * 1. The scalar version is registered first (fallback, available everywhere).
 * 2. SIMD versions are registered next (if the CPU supports them):
 *    - x86: SSE -> AVX -> AVX512 (increasing performance)
 *    - ARM: NEON
 *
 * The dispatcher prefers the higher-performance versions registered later.
 *
 * Thread safety:
 * - register_all_functions() should be called once at program start-up.
 * - It is not thread-safe and must not be called concurrently.
 * - Using the functions after registration has finished is thread-safe.
 */
class audio_processing_registry {
public:
    /**
     * @brief Register every audio processing function.
     *
     * Registers all supported audio processing functions with the SIMD
     * function dispatcher, detecting the CPU features and registering every
     * compatible implementation.
     *
     * Registered functions:
     * - Signal processing: mix_audio, apply_gain
     * - Analysis: calculate_rms, calculate_peak
     * - Effects: normalize_audio, stereo_to_mono, limit_audio, fade_audio, simple_eq
     *
     * Each function is registered in several versions (if the CPU supports them):
     * - scalar (always present)
     * - SSE    (x86, if supported)
     * - AVX    (x86, if supported)
     * - AVX512 (x86, if supported)
     * - NEON   (ARM, if supported)
     *
     * @note Call early during start-up, exactly once.
     * @note Not thread-safe; do not call concurrently.
     * @warning Calling it repeatedly may register functions more than once.
     *
     * Example:
     * ```cpp
     * int main() {
     *     // initialisation
     *     audio_processing_registry::register_all_functions();
     *
     *     // later use
     *     auto mix_func = simd_func_dispatcher::get_function<...>("mix_audio");
     *     mix_func(src1, src2, dst, samples);
     *     return 0;
     * }
     * ```
     */
    static void register_all_functions();

    /**
     * @brief Print every available audio processing function.
     *
     * Lists all registered audio processing functions together with their
     * implementation versions. Intended for debugging and for checking that
     * registration succeeded.
     *
     * Example output format:
     * ```
     * Available audio processing functions:
     *   mix_audio:
     *     - scalar (baseline)
     *     - sse (4x SIMD)
     *     - avx (8x SIMD)
     *   apply_gain:
     *     - scalar (baseline)
     *     - neon (ARM SIMD)
     *   ...
     * ```
     *
     * Typical uses:
     * - verifying that the SIMD functions were registered correctly
     * - checking which optimised versions the current platform supports
     * - performance debugging and analysis
     *
     * @note Debug-only; does not affect program behaviour.
     * @note Output goes to standard output or the logging system.
     */
    static void print_available_functions();
};
|
||||
File diff suppressed because it is too large
Load Diff
8
src/simd/misc/CMakeLists.txt
Normal file
8
src/simd/misc/CMakeLists.txt
Normal file
@@ -0,0 +1,8 @@
|
||||
project(alicho_simd)

# Static library holding the SIMD dispatcher and its helpers.
simple_library(STATIC)
target_link_libraries(${PROJECT_NAME} PUBLIC alicho_misc alicho_simd_interface)

# The dispatcher opens the per-instruction-set shared libraries at runtime.
# CMAKE_DL_LIBS names the library that provides dlopen()/dlclose() on the
# target platform and is empty where none is needed, so no hand-written
# platform test (and no hard-coded "dl") is required.
if(CMAKE_DL_LIBS)
    target_link_libraries(${PROJECT_NAME} PUBLIC ${CMAKE_DL_LIBS})
endif()
|
||||
@@ -31,6 +31,7 @@
|
||||
#pragma once
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <cstddef>
|
||||
|
||||
/**
|
||||
* @namespace scalar_audio_processing_func
|
||||
@@ -87,6 +87,7 @@
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include <cstddef>
|
||||
|
||||
#if ALICHO_PLATFORM_X86
|
||||
/**
|
||||
@@ -293,9 +293,9 @@ void cpu_feature_detector::detect_x86_features() {
|
||||
if ((cpuid_7.ebx & (1 << 30)) != 0) { info_.features |= static_cast<uint32_t>(cpu_feature::AVX512BW); } // bit 30: AVX-512 Byte/Word(字节/字操作)
|
||||
if ((cpuid_7.ebx & (1 << 17)) != 0) { info_.features |= static_cast<uint32_t>(cpu_feature::AVX512DQ); } // bit 17: AVX-512 DQ(双字/四字操作)
|
||||
|
||||
// 从CPUID.7.0.ECX寄存提取更多AVX-512扩展特性
|
||||
// 从CPUID.7.0.ECX寄存器提取更多AVX-512扩展特性
|
||||
if ((cpuid_7.ecx & (1 << 21)) != 0) { info_.features |= static_cast<uint32_t>(cpu_feature::AVX512IFMA); } // bit 21: AVX-512 IFMA(整数融合乘加)
|
||||
if ((cpuid_7.ecx & (1 << 2)) != 0) { info_.features |= static_cast<uint32_t>(cpu_feature::AVX512VBMI); } // bit 1: AVX-512 VBMI(向量字节操作)
|
||||
if ((cpuid_7.ecx & (1 << 1)) != 0) { info_.features |= static_cast<uint32_t>(cpu_feature::AVX512VBMI); } // bit 1: AVX-512 VBMI(向量字节操作)
|
||||
|
||||
// ========================================================================
|
||||
// 步骤4: 确定最高可用的SIMD级别
|
||||
95
src/simd/misc/simd_api.h
Normal file
95
src/simd/misc/simd_api.h
Normal file
@@ -0,0 +1,95 @@
|
||||
#pragma once
|
||||
|
||||
#include "simd_func_dispatcher.h"
|
||||
|
||||
namespace simd {
|
||||
|
||||
// ============================================================================
|
||||
// SIMD函数的零开销包装接口
|
||||
//
|
||||
// 这些inline函数会被编译器优化为直接的函数指针调用,
|
||||
// 实现零开销的SIMD函数调度
|
||||
// ============================================================================
|
||||
|
||||
inline void fill_buffer(float* buffer, float value, size_t num_samples) {
|
||||
simd_func_dispatcher::instance().get_fill_buffer()(buffer, value, num_samples);
|
||||
}
|
||||
|
||||
inline void mix_audio(const float* src1, const float* src2,
|
||||
float* dst, size_t num_samples) {
|
||||
simd_func_dispatcher::instance().get_mix_audio()(src1, src2, dst, num_samples);
|
||||
}
|
||||
|
||||
inline void apply_gain(const float* src, float* dst,
|
||||
float gain, size_t num_samples) {
|
||||
simd_func_dispatcher::instance().get_apply_gain()(src, dst, gain, num_samples);
|
||||
}
|
||||
|
||||
inline float calculate_rms(const float* src, size_t num_samples) {
|
||||
return simd_func_dispatcher::instance().get_calculate_rms()(src, num_samples);
|
||||
}
|
||||
|
||||
inline float calculate_peak(const float* src, size_t num_samples) {
|
||||
return simd_func_dispatcher::instance().get_calculate_peak()(src, num_samples);
|
||||
}
|
||||
|
||||
inline void normalize_audio(const float* src, float* dst,
|
||||
float target_peak, size_t num_samples) {
|
||||
simd_func_dispatcher::instance().get_normalize_audio()(
|
||||
src, dst, target_peak, num_samples);
|
||||
}
|
||||
|
||||
inline void stereo_to_mono(const float* stereo_src, float* mono_dst,
|
||||
size_t num_stereo_samples) {
|
||||
simd_func_dispatcher::instance().get_stereo_to_mono()(
|
||||
stereo_src, mono_dst, num_stereo_samples);
|
||||
}
|
||||
|
||||
inline void limit_audio(const float* src, float* dst, float threshold,
|
||||
float* limiter_state, float sample_rate, size_t num_samples) {
|
||||
simd_func_dispatcher::instance().get_limit_audio()(
|
||||
src, dst, threshold, limiter_state, sample_rate, num_samples);
|
||||
}
|
||||
|
||||
inline void fade_audio(const float* src, float* dst, size_t fade_in_samples,
|
||||
size_t fade_out_samples, size_t num_samples) {
|
||||
simd_func_dispatcher::instance().get_fade_audio()(
|
||||
src, dst, fade_in_samples, fade_out_samples, num_samples);
|
||||
}
|
||||
|
||||
inline void simple_eq(const float* src, float* dst, float low_gain,
|
||||
float mid_gain, float high_gain, float* eq_state,
|
||||
size_t num_samples) {
|
||||
simd_func_dispatcher::instance().get_simple_eq()(
|
||||
src, dst, low_gain, mid_gain, high_gain, eq_state, num_samples);
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// 调试和信息接口
|
||||
// ============================================================================
|
||||
|
||||
/// 获取当前激活的SIMD版本
|
||||
/// @return 当前使用的SIMD指令集版本
|
||||
inline auto get_active_simd_version() -> simd_func_version {
|
||||
return simd_func_dispatcher::instance().get_active_version();
|
||||
}
|
||||
|
||||
/// 获取当前激活的SIMD版本的字符串表示
|
||||
/// @return 版本名称(如 "AVX2", "NEON", "SCALAR" 等)
|
||||
inline auto get_active_simd_version_string() -> std::string {
|
||||
auto version = get_active_simd_version();
|
||||
switch (version) {
|
||||
case simd_func_version::SCALAR: return "SCALAR";
|
||||
case simd_func_version::SSE: return "SSE";
|
||||
case simd_func_version::SSE3: return "SSE3";
|
||||
case simd_func_version::SSE4: return "SSE4";
|
||||
case simd_func_version::AVX: return "AVX";
|
||||
case simd_func_version::AVX2: return "AVX2";
|
||||
case simd_func_version::AVX512: return "AVX512";
|
||||
case simd_func_version::NEON: return "NEON";
|
||||
case simd_func_version::NEON_FP16: return "NEON_FP16";
|
||||
default: return "UNKNOWN";
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace simd
|
||||
201
src/simd/misc/simd_func_dispatcher.cpp
Normal file
201
src/simd/misc/simd_func_dispatcher.cpp
Normal file
@@ -0,0 +1,201 @@
|
||||
#include "simd_func_dispatcher.h"
|
||||
#include "cpu_features.h"
|
||||
#include "logger.h"
|
||||
|
||||
#include <string_view>
|
||||
|
||||
#define SIMD_FUNC_DISPATCHER_LOG_MODULE "simd_func_dispatcher"
|
||||
|
||||
// 将 simd_level 映射到 simd_func_version
|
||||
static std::string_view version_to_lib_name(simd_func_version version) {
|
||||
switch (version) {
|
||||
case simd_func_version::SCALAR: return "scaler";
|
||||
case simd_func_version::SSE: return "sse";
|
||||
case simd_func_version::AVX: return "avx";
|
||||
case simd_func_version::AVX2: return "avx2";
|
||||
case simd_func_version::AVX512: return "avx512";
|
||||
case simd_func_version::NEON: return "neon";
|
||||
case simd_func_version::NEON_FP16: return "neon_fp16";
|
||||
default: return "";
|
||||
}
|
||||
}
|
||||
|
||||
// Tries to open one shared library per simd_func_version that has a library
// suffix, remembers every library that opened successfully, and then resolves
// the function pointers via initialize_function_pointers().
simd_func_dispatcher::simd_func_dispatcher() {
    constexpr int version_count = static_cast<int>(simd_func_version::COUNT);

    for (int index = 0; index < version_count; ++index) {
        const auto version = static_cast<simd_func_version>(index);
        const auto suffix  = version_to_lib_name(version);

        // Versions without a suffix have no library to look for.
        if (!suffix.empty()) {
            const std::string stem(suffix);

            // Platform-specific file name; stays empty if no platform macro is set.
            std::string library_file;
#if ALICHO_PLATFORM_WINDOWS
            library_file = "alicho_simd_" + stem + ".dll";
#elif ALICHO_PLATFORM_LINUX
            library_file = "./libalicho_simd_" + stem + ".so";
#elif ALICHO_PLATFORM_APPLE
            library_file = "./libalicho_simd_" + stem + ".dylib";
#endif

            auto library = std::make_unique<lib_handle>();
            if (!library->open(library_file)) {
                // A missing library is expected for unsupported versions.
                log_module_debug(SIMD_FUNC_DISPATCHER_LOG_MODULE, "Could not find or load SIMD library: {}", library_file);
            } else {
                log_module_info(SIMD_FUNC_DISPATCHER_LOG_MODULE, "Successfully loaded SIMD library: {}", library_file);
                loaded_libraries_[version] = std::move(library);
            }
        }
    }

    // Resolve and cache the function pointers.
    initialize_function_pointers();
}

simd_func_dispatcher::~simd_func_dispatcher() = default;
|
||||
|
||||
// 初始化函数指针
|
||||
void simd_func_dispatcher::initialize_function_pointers() {
|
||||
// 1. 检测CPU能力
|
||||
const auto& cpu_info = get_cpu_info();
|
||||
auto recommended_level = get_recommended_simd_level();
|
||||
auto preferred_version = simd_level_to_version(recommended_level);
|
||||
|
||||
log_module_info(SIMD_FUNC_DISPATCHER_LOG_MODULE, "检测到CPU最高SIMD级别: {}", static_cast<int>(cpu_info.max_simd_level));
|
||||
log_module_info(SIMD_FUNC_DISPATCHER_LOG_MODULE, "推荐使用SIMD级别: {}", static_cast<int>(recommended_level));
|
||||
|
||||
// 2. 找到可用版本
|
||||
auto target_version = find_fallback_version(preferred_version);
|
||||
|
||||
log_module_info(SIMD_FUNC_DISPATCHER_LOG_MODULE, "选择的SIMD版本: {}", static_cast<int>(target_version));
|
||||
|
||||
// 3. 加载函数指针
|
||||
if (!try_load_functions(target_version)) {
|
||||
throw std::runtime_error("Failed to load SIMD functions for any available version");
|
||||
}
|
||||
|
||||
active_version_ = target_version;
|
||||
log_module_info(SIMD_FUNC_DISPATCHER_LOG_MODULE, "成功初始化SIMD函数调度器,激活版本: {}", static_cast<int>(active_version_));
|
||||
}
|
||||
|
||||
// 回退策略
|
||||
auto simd_func_dispatcher::find_fallback_version(simd_func_version preferred) -> simd_func_version {
|
||||
// 定义回退序列
|
||||
#if ALICHO_PLATFORM_X86
|
||||
// x86/x64 回退序列:从高到低
|
||||
static const simd_func_version x86_fallback[] = {
|
||||
simd_func_version::AVX512,
|
||||
simd_func_version::AVX2,
|
||||
simd_func_version::AVX,
|
||||
simd_func_version::SSE,
|
||||
simd_func_version::SCALAR
|
||||
};
|
||||
|
||||
// 从首选版本开始查找
|
||||
bool found_preferred = false;
|
||||
for (auto version : x86_fallback) {
|
||||
if (version == preferred) {
|
||||
found_preferred = true;
|
||||
}
|
||||
|
||||
if (found_preferred && loaded_libraries_.count(version) > 0) {
|
||||
log_module_info(SIMD_FUNC_DISPATCHER_LOG_MODULE, "找到可用的SIMD版本: {}", static_cast<int>(version));
|
||||
return version;
|
||||
}
|
||||
}
|
||||
|
||||
#elif ALICHO_PLATFORM_ARM
|
||||
// ARM 回退序列
|
||||
static const simd_func_version arm_fallback[] = {
|
||||
simd_func_version::NEON_FP16,
|
||||
simd_func_version::NEON,
|
||||
simd_func_version::SCALAR
|
||||
};
|
||||
|
||||
bool found_preferred = false;
|
||||
for (auto version : arm_fallback) {
|
||||
if (version == preferred) {
|
||||
found_preferred = true;
|
||||
}
|
||||
|
||||
if (found_preferred && loaded_libraries_.count(version) > 0) {
|
||||
log_module_info(SIMD_FUNC_DISPATCHER_LOG_MODULE, "找到可用的SIMD版本: {}", static_cast<int>(version));
|
||||
return version;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// 最后回退到标量版本
|
||||
if (loaded_libraries_.count(simd_func_version::SCALAR) > 0) {
|
||||
log_module_info(SIMD_FUNC_DISPATCHER_LOG_MODULE, "回退到标量版本");
|
||||
return simd_func_version::SCALAR;
|
||||
}
|
||||
|
||||
throw std::runtime_error("No SIMD library available, not even scalar version");
|
||||
}
|
||||
|
||||
// 尝试从指定版本加载函数
|
||||
auto simd_func_dispatcher::try_load_functions(simd_func_version version) -> bool {
|
||||
// 检查库是否已加载
|
||||
auto it = loaded_libraries_.find(version);
|
||||
if (it == loaded_libraries_.end()) {
|
||||
log_module_error(SIMD_FUNC_DISPATCHER_LOG_MODULE, "SIMD库版本 {} 未加载", static_cast<int>(version));
|
||||
return false;
|
||||
}
|
||||
|
||||
auto* handle = it->second.get();
|
||||
|
||||
// 加载所有10个函数
|
||||
fill_buffer_ = load_function<fill_buffer_t>(handle, "fill_buffer");
|
||||
mix_audio_ = load_function<mix_audio_t>(handle, "mix_audio");
|
||||
apply_gain_ = load_function<apply_gain_t>(handle, "apply_gain");
|
||||
calculate_rms_ = load_function<calculate_rms_t>(handle, "calculate_rms");
|
||||
calculate_peak_ = load_function<calculate_peak_t>(handle, "calculate_peak");
|
||||
normalize_audio_ = load_function<normalize_audio_t>(handle, "normalize_audio");
|
||||
stereo_to_mono_ = load_function<stereo_to_mono_t>(handle, "stereo_to_mono");
|
||||
limit_audio_ = load_function<limit_audio_t>(handle, "limit_audio");
|
||||
fade_audio_ = load_function<fade_audio_t>(handle, "fade_audio");
|
||||
simple_eq_ = load_function<simple_eq_t>(handle, "simple_eq");
|
||||
|
||||
// 检查是否所有函数都加载成功
|
||||
if (!fill_buffer_ || !mix_audio_ || !apply_gain_ || !calculate_rms_ ||
|
||||
!calculate_peak_ || !normalize_audio_ || !stereo_to_mono_ ||
|
||||
!limit_audio_ || !fade_audio_ || !simple_eq_) {
|
||||
log_module_error(SIMD_FUNC_DISPATCHER_LOG_MODULE, "加载SIMD函数失败,版本: {}", static_cast<int>(version));
|
||||
return false;
|
||||
}
|
||||
|
||||
log_module_info(SIMD_FUNC_DISPATCHER_LOG_MODULE, "成功加载所有SIMD函数,版本: {}", static_cast<int>(version));
|
||||
return true;
|
||||
}
|
||||
|
||||
// 从lib_handle加载单个函数
|
||||
template<typename FuncT>
|
||||
auto simd_func_dispatcher::load_function(lib_handle* handle, const std::string& name) -> FuncT {
|
||||
if (!handle) {
|
||||
log_module_error(SIMD_FUNC_DISPATCHER_LOG_MODULE, "lib_handle 为空,无法加载函数: {}", name);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// FuncT 现在已经是函数指针类型 (如 void(*)(float*, float, size_t))
|
||||
// 移除指针得到函数签名类型 (如 void(float*, float, size_t))
|
||||
using func_signature = std::remove_pointer_t<FuncT>;
|
||||
|
||||
// 调用 lib_handle::get_function_by_name() 获取 std::function
|
||||
auto std_func = handle->get_function_by_name<func_signature>(name);
|
||||
|
||||
if (!std_func) {
|
||||
log_module_error(SIMD_FUNC_DISPATCHER_LOG_MODULE, "加载函数失败: {}", name);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// 从 std::function 获取底层函数指针
|
||||
// std::function::target<T>() 返回指向目标可调用对象的指针
|
||||
auto* func_ptr = std_func.template target<FuncT>();
|
||||
|
||||
if (!func_ptr || !*func_ptr) {
|
||||
log_module_error(SIMD_FUNC_DISPATCHER_LOG_MODULE, "转换函数指针失败: {}", name);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
log_module_debug(SIMD_FUNC_DISPATCHER_LOG_MODULE, "成功加载函数: {}", name);
|
||||
return *func_ptr;
|
||||
}
|
||||
223
src/simd/misc/simd_func_dispatcher.h
Normal file
223
src/simd/misc/simd_func_dispatcher.h
Normal file
@@ -0,0 +1,223 @@
|
||||
#pragma once
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <stdexcept>
|
||||
#include <unordered_map>
|
||||
#include "lib_handle.h"
|
||||
|
||||
#include "cpu_features.h"
|
||||
#include "simd_interface.h"
|
||||
|
||||
/**
 * @enum simd_func_version
 * @brief Identifies which SIMD implementation of a function is meant.
 *
 * Lists every SIMD-optimised version a function may have; each enumerator
 * corresponds to one SIMD instruction-set level.
 *
 * Ordering:
 * - Enumerators go from weakest to strongest expected performance.
 * - SCALAR is the most basic version and is supported by every CPU.
 * - COUNT is used for sizes/iteration and is not a real version.
 *
 * Relation to simd_level:
 * - simd_level describes the capability level of the CPU.
 * - simd_func_version names the implementation of a function.
 * - simd_level_to_version() converts between them.
 *
 * @note Not every function has to implement every version.
 * @see simd_level, simd_level_to_version()
 */
enum class simd_func_version {
    /** Scalar implementation - plain C++, no SIMD.
     * - Compatibility: every CPU
     * - Performance: baseline (1x)
     * - Purpose: guaranteed fallback and reference implementation
     * - Required: must exist, acts as the fallback version
     */
    SCALAR = 0,

    /** SSE implementation - uses SSE/SSE2 instructions.
     * - Compatibility: all x86/x64 CPUs from 2003 on
     * - Vector width: 128 bits
     * - Speed-up: roughly 2-4x
     */
    SSE,

    /** SSE3 implementation - uses SSE3/SSSE3 instructions.
     * - Compatibility: mainstream CPUs from 2006 on
     * - Adds: horizontal operations, complex-number support
     * - Speed-up: about 10-20% over SSE
     */
    SSE3,

    /** SSE4 implementation - uses SSE4.1/SSE4.2 instructions.
     * - Compatibility: mainstream CPUs from 2008 on
     * - Adds: dot product, blend, string processing
     * - Speed-up: about 15-30% over SSE3
     */
    SSE4,

    /** AVX implementation - uses AVX instructions.
     * - Compatibility: mainstream CPUs from 2011 on
     * - Vector width: 256 bits
     * - Speed-up: roughly 2x SSE4
     */
    AVX,

    /** AVX2 implementation - uses AVX2 + FMA instructions.
     * - Compatibility: mainstream CPUs from 2013 on
     * - Adds: full 256-bit integer operations, FMA
     * - Speed-up: about 50-100% over AVX
     * - Recommended: currently the best performance/compatibility trade-off
     */
    AVX2,

    /** AVX-512 implementation - uses the AVX-512 instruction set.
     * - Compatibility: high-end CPUs from 2016 on
     * - Vector width: 512 bits
     * - Speed-up: roughly 2x AVX2 (theoretical)
     * - Caveat: may cause the CPU to lower its clock frequency
     */
    AVX512,

    /** NEON implementation - uses ARM NEON instructions.
     * - Compatibility: every ARMv8-A (64-bit ARM) CPU
     * - Vector width: 128 bits
     * - Performance: comparable to SSE4
     * - Used on: mobile devices, Apple Silicon
     */
    NEON,

    /** NEON + FP16 implementation - uses NEON half-precision floating point.
     * - Compatibility: ARMv8.2-A and newer
     * - Adds: hardware FP16 arithmetic
     * - Performance: FP16 operations about 2x faster
     * - Used for: on-device AI inference
     */
    NEON_FP16,

    /** RISC-V vector extension implementation.
     * - Compatibility: RISC-V processors with RVV
     * - Characteristic: variable vector length
     * - Used on: embedded, IoT
     */
    VECTOR,

    /** Number-of-versions marker.
     * Used for sizes and iteration bounds; not an actual function version.
     */
    COUNT
};
|
||||
|
||||
/**
|
||||
* @brief 将SIMD级别转换为函数版本
|
||||
* @param level CPU的SIMD级别
|
||||
* @return 对应的函数版本枚举值
|
||||
*
|
||||
* 将cpu_feature_detector检测到的SIMD级别转换为
|
||||
* 函数调度器使用的版本标识。
|
||||
*
|
||||
* 映射关系:
|
||||
* - simd_level::NONE -> simd_func_version::SCALAR
|
||||
* - simd_level::SSE -> simd_func_version::SSE
|
||||
* - simd_level::AVX2 -> simd_func_version::AVX2
|
||||
* - 等等...
|
||||
*
|
||||
* @note constexpr函数,编译时求值,零运行时开销
|
||||
* @see simd_level, simd_func_version
|
||||
*/
|
||||
constexpr auto simd_level_to_version(simd_level level) {
|
||||
switch (level) {
|
||||
case simd_level::NONE:
|
||||
return simd_func_version::SCALAR;
|
||||
case simd_level::SSE:
|
||||
return simd_func_version::SSE;
|
||||
case simd_level::SSE3:
|
||||
return simd_func_version::SSE;
|
||||
case simd_level::SSE4:
|
||||
return simd_func_version::SSE;
|
||||
case simd_level::AVX:
|
||||
return simd_func_version::AVX;
|
||||
case simd_level::AVX2:
|
||||
return simd_func_version::AVX2;
|
||||
case simd_level::AVX512:
|
||||
return simd_func_version::AVX512;
|
||||
case simd_level::NEON:
|
||||
return simd_func_version::NEON;
|
||||
case simd_level::NEON_FP16:
|
||||
return simd_func_version::NEON_FP16;
|
||||
}
|
||||
|
||||
// 默认回退到标量版本
|
||||
return simd_func_version::SCALAR;
|
||||
}
|
||||
|
||||
// Singleton that opens the per-version SIMD shared libraries, picks one
// version and caches the function pointers resolved from it. The getters
// below only return those cached pointers.
class simd_func_dispatcher : public lazy_singleton<simd_func_dispatcher> {
public:
    // lazy_singleton needs access to the protected constructor/destructor.
    friend class lazy_singleton<simd_func_dispatcher>;

    // Function-pointer types, deduced with decltype from the declarations in simd_interface.h.
    using fill_buffer_t = decltype(&fill_buffer);
    using mix_audio_t = decltype(&mix_audio);
    using apply_gain_t = decltype(&apply_gain);
    using calculate_rms_t = decltype(&calculate_rms);
    using calculate_peak_t = decltype(&calculate_peak);
    using normalize_audio_t = decltype(&normalize_audio);
    using stereo_to_mono_t = decltype(&stereo_to_mono);
    using limit_audio_t = decltype(&limit_audio);
    using fade_audio_t = decltype(&fade_audio);
    using simple_eq_t = decltype(&simple_eq);

    // Accessors returning the cached function pointers.
    [[nodiscard]] auto get_fill_buffer() const noexcept -> fill_buffer_t { return fill_buffer_; }
    [[nodiscard]] auto get_mix_audio() const noexcept -> mix_audio_t { return mix_audio_; }
    [[nodiscard]] auto get_apply_gain() const noexcept -> apply_gain_t { return apply_gain_; }
    [[nodiscard]] auto get_calculate_rms() const noexcept -> calculate_rms_t { return calculate_rms_; }
    [[nodiscard]] auto get_calculate_peak() const noexcept -> calculate_peak_t { return calculate_peak_; }
    [[nodiscard]] auto get_normalize_audio() const noexcept -> normalize_audio_t { return normalize_audio_; }
    [[nodiscard]] auto get_stereo_to_mono() const noexcept -> stereo_to_mono_t { return stereo_to_mono_; }
    [[nodiscard]] auto get_limit_audio() const noexcept -> limit_audio_t { return limit_audio_; }
    [[nodiscard]] auto get_fade_audio() const noexcept -> fade_audio_t { return fade_audio_; }
    [[nodiscard]] auto get_simple_eq() const noexcept -> simple_eq_t { return simple_eq_; }

    // Version whose functions are currently cached.
    [[nodiscard]] auto get_active_version() const noexcept -> simd_func_version { return active_version_; }

protected:
    // Construction and destruction go through lazy_singleton.
    simd_func_dispatcher();
    ~simd_func_dispatcher() override;

private:
    // Chooses a version and fills the cached function pointers.
    void initialize_function_pointers();

    // Tries to resolve every function from the library of the given version.
    auto try_load_functions(simd_func_version version) -> bool;

    // Fallback policy: best loaded version at or below `preferred`.
    auto find_fallback_version(simd_func_version preferred) -> simd_func_version;

    // Resolves a single function from a lib_handle.
    template<typename FuncT>
    auto load_function(lib_handle* handle, const std::string& name) -> FuncT;

    // Libraries that were opened successfully, keyed by version.
    std::unordered_map<simd_func_version, std::unique_ptr<lib_handle>> loaded_libraries_;

    // Cached function pointers (nullptr until resolved).
    fill_buffer_t fill_buffer_ = nullptr;
    mix_audio_t mix_audio_ = nullptr;
    apply_gain_t apply_gain_ = nullptr;
    calculate_rms_t calculate_rms_ = nullptr;
    calculate_peak_t calculate_peak_ = nullptr;
    normalize_audio_t normalize_audio_ = nullptr;
    stereo_to_mono_t stereo_to_mono_ = nullptr;
    limit_audio_t limit_audio_ = nullptr;
    fade_audio_t fade_audio_ = nullptr;
    simple_eq_t simple_eq_ = nullptr;

    // Currently active version.
    simd_func_version active_version_ = simd_func_version::SCALAR;
};
|
||||
7
src/simd/simd_avx/CMakeLists.txt
Normal file
7
src/simd/simd_avx/CMakeLists.txt
Normal file
@@ -0,0 +1,7 @@
|
||||
project(alicho_simd_avx)

# Shared library with the 256-bit x86 implementations; only built for x86-64.
# The instruction-set flag is chosen per compiler: MSVC (cl.exe) does not
# understand the GCC/Clang spelling "-mavx2" and uses /arch:AVX2 instead.
# NOTE(review): the target is named "avx" but is compiled for AVX2, so the
# compiler may emit AVX2 instructions. If this library is meant to run on
# AVX-only CPUs, confirm whether -mavx / /arch:AVX is the intended level.
if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64")
    simple_library(SHARED)
    target_compile_options(${PROJECT_NAME} PRIVATE
        "$<$<CXX_COMPILER_ID:MSVC>:/arch:AVX2>"
        "$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-mavx2>"
    )
    target_link_libraries(${PROJECT_NAME} PUBLIC alicho_simd_interface)
endif()
|
||||
761
src/simd/simd_avx/simd_func.cpp
Normal file
761
src/simd/simd_avx/simd_func.cpp
Normal file
@@ -0,0 +1,761 @@
|
||||
/**
|
||||
* @file x86_avx_audio_processing_func.cpp
|
||||
* @brief x86 AVX音频处理函数实现
|
||||
*/
|
||||
|
||||
#include "simd_interface.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <immintrin.h>
|
||||
#include "aligned_allocator.h"
|
||||
|
||||
#if ALICHO_PLATFORM_X86
|
||||
|
||||
extern "C"
|
||||
{
|
||||
// Writes `value` into buffer[0 .. num_samples).
// The buffer must satisfy ALIGNMENT_AVX because aligned 256-bit stores are used.
SIMD_EXPORT void fill_buffer(float *buffer, float value, size_t num_samples)
{
    ASSERT_ALIGNED(buffer, ALIGNMENT_AVX);

    constexpr size_t lanes = 8;           // floats per 256-bit register
    constexpr size_t block = lanes * 4;   // floats per 4x unrolled iteration

    const __m256 splat = _mm256_set1_ps(value);
    size_t pos = 0;

    // Main loop: four aligned stores per iteration.
    while (pos + block <= num_samples)
    {
        float *out = buffer + pos;
        _mm256_store_ps(out, splat);
        _mm256_store_ps(out + 8, splat);
        _mm256_store_ps(out + 16, splat);
        _mm256_store_ps(out + 24, splat);
        pos += block;
    }

    // Remaining whole vectors.
    while (pos + lanes <= num_samples)
    {
        _mm256_store_ps(buffer + pos, splat);
        pos += lanes;
    }

    // Scalar tail.
    while (pos < num_samples)
    {
        buffer[pos++] = value;
    }
}
|
||||
|
||||
// Element-wise sum: dst[k] = src1[k] + src2[k] for k in [0, num_samples).
// All three pointers must satisfy ALIGNMENT_AVX (aligned loads and stores).
SIMD_EXPORT void mix_audio(const float *src1, const float *src2, float *dst, size_t num_samples)
{
    ASSERT_ALIGNED(src1, ALIGNMENT_AVX);
    ASSERT_ALIGNED(src2, ALIGNMENT_AVX);
    ASSERT_ALIGNED(dst, ALIGNMENT_AVX);

    constexpr size_t lanes = 8;           // floats per 256-bit register
    constexpr size_t block = lanes * 4;   // floats per 4x unrolled iteration
    size_t pos = 0;

    // Main loop: all eight loads happen before the first store of the block.
    while (pos + block <= num_samples)
    {
        const float *lhs = src1 + pos;
        const float *rhs = src2 + pos;
        float *out = dst + pos;

        const __m256 l0 = _mm256_load_ps(lhs);
        const __m256 l1 = _mm256_load_ps(lhs + 8);
        const __m256 l2 = _mm256_load_ps(lhs + 16);
        const __m256 l3 = _mm256_load_ps(lhs + 24);

        const __m256 r0 = _mm256_load_ps(rhs);
        const __m256 r1 = _mm256_load_ps(rhs + 8);
        const __m256 r2 = _mm256_load_ps(rhs + 16);
        const __m256 r3 = _mm256_load_ps(rhs + 24);

        const __m256 s0 = _mm256_add_ps(l0, r0);
        const __m256 s1 = _mm256_add_ps(l1, r1);
        const __m256 s2 = _mm256_add_ps(l2, r2);
        const __m256 s3 = _mm256_add_ps(l3, r3);

        _mm256_store_ps(out, s0);
        _mm256_store_ps(out + 8, s1);
        _mm256_store_ps(out + 16, s2);
        _mm256_store_ps(out + 24, s3);

        pos += block;
    }

    // Remaining whole vectors.
    while (pos + lanes <= num_samples)
    {
        const __m256 lhs = _mm256_load_ps(src1 + pos);
        const __m256 rhs = _mm256_load_ps(src2 + pos);
        _mm256_store_ps(dst + pos, _mm256_add_ps(lhs, rhs));
        pos += lanes;
    }

    // Scalar tail.
    while (pos < num_samples)
    {
        dst[pos] = src1[pos] + src2[pos];
        ++pos;
    }
}
|
||||
|
||||
// Scales every sample: dst[k] = src[k] * gain for k in [0, num_samples).
// Both pointers must satisfy ALIGNMENT_AVX (aligned loads and stores).
SIMD_EXPORT void apply_gain(const float *src, float *dst, float gain, size_t num_samples)
{
    ASSERT_ALIGNED(src, ALIGNMENT_AVX);
    ASSERT_ALIGNED(dst, ALIGNMENT_AVX);

    constexpr size_t lanes = 8;           // floats per 256-bit register
    constexpr size_t block = lanes * 4;   // floats per 4x unrolled iteration

    const __m256 factor = _mm256_set1_ps(gain);
    size_t pos = 0;

    // Main loop: four loads, four multiplies, then four stores.
    while (pos + block <= num_samples)
    {
        const float *in = src + pos;
        float *out = dst + pos;

        const __m256 v0 = _mm256_load_ps(in);
        const __m256 v1 = _mm256_load_ps(in + 8);
        const __m256 v2 = _mm256_load_ps(in + 16);
        const __m256 v3 = _mm256_load_ps(in + 24);

        const __m256 p0 = _mm256_mul_ps(v0, factor);
        const __m256 p1 = _mm256_mul_ps(v1, factor);
        const __m256 p2 = _mm256_mul_ps(v2, factor);
        const __m256 p3 = _mm256_mul_ps(v3, factor);

        _mm256_store_ps(out, p0);
        _mm256_store_ps(out + 8, p1);
        _mm256_store_ps(out + 16, p2);
        _mm256_store_ps(out + 24, p3);

        pos += block;
    }

    // Remaining whole vectors.
    while (pos + lanes <= num_samples)
    {
        const __m256 v = _mm256_load_ps(src + pos);
        _mm256_store_ps(dst + pos, _mm256_mul_ps(v, factor));
        pos += lanes;
    }

    // Scalar tail.
    while (pos < num_samples)
    {
        dst[pos] = src[pos] * gain;
        ++pos;
    }
}
|
||||
|
||||
// Returns the root-mean-square of src[0 .. num_samples):
// sqrt(sum(src[k]^2) / num_samples).
//
// src must satisfy ALIGNMENT_AVX (aligned loads). An empty input returns 0.0f
// rather than dividing by zero, which would produce NaN.
// Whole vectors are accumulated in single precision; the scalar tail and the
// final division/square root are done in double precision.
SIMD_EXPORT float calculate_rms(const float *src, size_t num_samples)
{
    ASSERT_ALIGNED(src, ALIGNMENT_AVX);

    // Nothing to average: avoid sqrt(0 / 0).
    if (num_samples == 0)
    {
        return 0.0f;
    }

    constexpr size_t simd_width = 8;
    constexpr size_t unroll_factor = 4;
    size_t i = 0;

    // Four independent accumulators for the unrolled loop.
    auto sum_squares0 = _mm256_setzero_ps();
    auto sum_squares1 = _mm256_setzero_ps();
    auto sum_squares2 = _mm256_setzero_ps();
    auto sum_squares3 = _mm256_setzero_ps();

    for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
    {
        const auto a0 = _mm256_load_ps(&src[i]);
        const auto a1 = _mm256_load_ps(&src[i + 8]);
        const auto a2 = _mm256_load_ps(&src[i + 16]);
        const auto a3 = _mm256_load_ps(&src[i + 24]);

        const auto squared0 = _mm256_mul_ps(a0, a0);
        const auto squared1 = _mm256_mul_ps(a1, a1);
        const auto squared2 = _mm256_mul_ps(a2, a2);
        const auto squared3 = _mm256_mul_ps(a3, a3);

        sum_squares0 = _mm256_add_ps(sum_squares0, squared0);
        sum_squares1 = _mm256_add_ps(sum_squares1, squared1);
        sum_squares2 = _mm256_add_ps(sum_squares2, squared2);
        sum_squares3 = _mm256_add_ps(sum_squares3, squared3);
    }

    // Merge the four accumulators into one.
    auto sum_squares = _mm256_add_ps(_mm256_add_ps(sum_squares0, sum_squares1),
                                     _mm256_add_ps(sum_squares2, sum_squares3));

    // Remaining whole vectors.
    for (; i + simd_width <= num_samples; i += simd_width)
    {
        const auto a = _mm256_load_ps(&src[i]);
        const auto squared = _mm256_mul_ps(a, a);
        sum_squares = _mm256_add_ps(sum_squares, squared);
    }

    // Horizontal sum: two hadd steps reduce each 128-bit half to its total,
    // then the two halves are added together.
    auto hadd1 = _mm256_hadd_ps(sum_squares, sum_squares);
    auto hadd2 = _mm256_hadd_ps(hadd1, hadd1);

    auto low = _mm256_extractf128_ps(hadd2, 0);
    auto high = _mm256_extractf128_ps(hadd2, 1);
    auto final_sum = _mm_add_ps(low, high);
    double total_sum = _mm_cvtss_f32(final_sum);

    // Scalar tail, accumulated in double precision.
    for (; i < num_samples; ++i)
    {
        total_sum += static_cast<double>(src[i]) * static_cast<double>(src[i]);
    }

    return static_cast<float>(std::sqrt(total_sum / static_cast<double>(num_samples)));
}
|
||||
|
||||
// Returns the largest absolute sample value in src[0 .. num_samples)
// (0.0f when num_samples is 0). src must satisfy ALIGNMENT_AVX.
SIMD_EXPORT float calculate_peak(const float *src, size_t num_samples)
{
    ASSERT_ALIGNED(src, ALIGNMENT_AVX);

    constexpr size_t lanes = 8;           // floats per 256-bit register
    constexpr size_t block = lanes * 4;   // floats per 4x unrolled iteration

    // -0.0f has only the sign bit set; andnot with it clears the sign => |x|.
    const __m256 sign_bit = _mm256_set1_ps(-0.0f);

    __m256 best0 = _mm256_setzero_ps();
    __m256 best1 = _mm256_setzero_ps();
    __m256 best2 = _mm256_setzero_ps();
    __m256 best3 = _mm256_setzero_ps();

    size_t pos = 0;
    while (pos + block <= num_samples)
    {
        const float *in = src + pos;

        const __m256 m0 = _mm256_andnot_ps(sign_bit, _mm256_load_ps(in));
        const __m256 m1 = _mm256_andnot_ps(sign_bit, _mm256_load_ps(in + 8));
        const __m256 m2 = _mm256_andnot_ps(sign_bit, _mm256_load_ps(in + 16));
        const __m256 m3 = _mm256_andnot_ps(sign_bit, _mm256_load_ps(in + 24));

        best0 = _mm256_max_ps(best0, m0);
        best1 = _mm256_max_ps(best1, m1);
        best2 = _mm256_max_ps(best2, m2);
        best3 = _mm256_max_ps(best3, m3);

        pos += block;
    }

    // Merge the four running maxima.
    __m256 best = _mm256_max_ps(_mm256_max_ps(best0, best1),
                                _mm256_max_ps(best2, best3));

    // Remaining whole vectors.
    while (pos + lanes <= num_samples)
    {
        const __m256 magnitude = _mm256_andnot_ps(sign_bit, _mm256_load_ps(src + pos));
        best = _mm256_max_ps(best, magnitude);
        pos += lanes;
    }

    // Horizontal max: fold the upper half onto the lower half, then swap
    // neighbouring lanes and finally the two lane pairs.
    const __m128 lower = _mm256_extractf128_ps(best, 0);
    const __m128 upper = _mm256_extractf128_ps(best, 1);
    const __m128 folded = _mm_max_ps(lower, upper);

    const __m128 neighbours = _mm_shuffle_ps(folded, folded, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128 pair_max = _mm_max_ps(folded, neighbours);
    const __m128 halves = _mm_shuffle_ps(pair_max, pair_max, _MM_SHUFFLE(1, 0, 3, 2));
    const __m128 overall = _mm_max_ps(pair_max, halves);

    float peak = _mm_cvtss_f32(overall);

    // Scalar tail.
    for (; pos < num_samples; ++pos)
    {
        const float magnitude = std::fabs(src[pos]);
        if (magnitude > peak)
        {
            peak = magnitude;
        }
    }

    return peak;
}
|
||||
|
||||
/**
 * @brief Scales a buffer so that its absolute peak equals target_peak (AVX).
 *
 * Returns without touching dst when num_samples is 0 or target_peak is not
 * positive. When the measured peak is below 1e-10 the input is treated as
 * silence and dst is filled with zeros; otherwise the gain
 * target_peak / peak is applied through apply_gain().
 *
 * @param src          Input samples; checked with ASSERT_ALIGNED against ALIGNMENT_AVX.
 * @param dst          Output samples; same alignment check.
 * @param target_peak  Desired absolute peak of the output.
 * @param num_samples  Number of floats in src and dst.
 */
SIMD_EXPORT void normalize_audio(const float *src, float *dst, float target_peak, size_t num_samples)
{
    ASSERT_ALIGNED(src, ALIGNMENT_AVX);
    ASSERT_ALIGNED(dst, ALIGNMENT_AVX);

    const bool nothing_to_do = (num_samples == 0) || (target_peak <= 0.0f);
    if (nothing_to_do)
    {
        return;
    }

    const float measured_peak = calculate_peak(src, num_samples);

    // Normal case: the signal is audible, so scale it.
    if (!(measured_peak < 1e-10f))
    {
        apply_gain(src, dst, target_peak / measured_peak, num_samples);
        return;
    }

    // Silent input: dividing by the peak would blow up, so emit zeros instead.
    constexpr size_t lanes = 8;
    constexpr size_t block = lanes * 4;
    const __m256 zeros = _mm256_setzero_ps();

    size_t pos = 0;
    while (pos + block <= num_samples)
    {
        _mm256_store_ps(dst + pos, zeros);
        _mm256_store_ps(dst + pos + 8, zeros);
        _mm256_store_ps(dst + pos + 16, zeros);
        _mm256_store_ps(dst + pos + 24, zeros);
        pos += block;
    }

    while (pos + lanes <= num_samples)
    {
        _mm256_store_ps(dst + pos, zeros);
        pos += lanes;
    }

    while (pos < num_samples)
    {
        dst[pos] = 0.0f;
        ++pos;
    }
}
|
||||
|
||||
/**
 * @brief Down-mixes interleaved stereo (L0 R0 L1 R1 ...) to mono (AVX).
 *
 * Every output sample is (left + right) * 0.5.
 *
 * @param stereo_src          Interleaved input holding 2 * num_stereo_samples floats;
 *                            checked with ASSERT_ALIGNED against ALIGNMENT_AVX.
 * @param mono_dst            Output buffer of num_stereo_samples floats; same alignment check.
 * @param num_stereo_samples  Number of stereo frames (left/right pairs).
 */
SIMD_EXPORT void stereo_to_mono(const float *stereo_src, float *mono_dst, size_t num_stereo_samples)
{
    ASSERT_ALIGNED(stereo_src, ALIGNMENT_AVX);
    ASSERT_ALIGNED(mono_dst, ALIGNMENT_AVX);

    if (num_stereo_samples == 0)
    {
        return;
    }

    constexpr size_t simd_width = 8;
    constexpr size_t unroll_factor = 4;
    constexpr size_t frames_per_block = simd_width * unroll_factor;
    const auto half_vec = _mm256_set1_ps(0.5f);

    // Converts 8 interleaved frames (16 floats starting at `in`) into 8 mono
    // samples in their original order.
    //
    // _mm256_shuffle_ps only shuffles inside each 128-bit lane, so the two
    // input vectors are first regrouped across lanes:
    //   lo = [L0 R0 L1 R1 | L4 R4 L5 R5]
    //   hi = [L2 R2 L3 R3 | L6 R6 L7 R7]
    // after which the per-lane shuffle yields L0..L7 and R0..R7 in order.
    const auto downmix8 = [half_vec](const float *in) {
        const auto first = _mm256_load_ps(in);
        const auto second = _mm256_load_ps(in + 8);
        const auto lo = _mm256_permute2f128_ps(first, second, 0x20);
        const auto hi = _mm256_permute2f128_ps(first, second, 0x31);
        const auto left = _mm256_shuffle_ps(lo, hi, _MM_SHUFFLE(2, 0, 2, 0));
        const auto right = _mm256_shuffle_ps(lo, hi, _MM_SHUFFLE(3, 1, 3, 1));
        return _mm256_mul_ps(_mm256_add_ps(left, right), half_vec);
    };

    size_t frame = 0;

    // Unrolled main loop: 32 frames (64 input floats) per iteration. All four
    // results are computed before the first store.
    for (; frame + frames_per_block <= num_stereo_samples; frame += frames_per_block)
    {
        const float *in = stereo_src + frame * 2;

        const auto mono0 = downmix8(in);
        const auto mono1 = downmix8(in + 16);
        const auto mono2 = downmix8(in + 32);
        const auto mono3 = downmix8(in + 48);

        _mm256_store_ps(&mono_dst[frame], mono0);
        _mm256_store_ps(&mono_dst[frame + 8], mono1);
        _mm256_store_ps(&mono_dst[frame + 16], mono2);
        _mm256_store_ps(&mono_dst[frame + 24], mono3);
    }

    // Remaining whole vectors: 8 frames at a time. `frame` stays a multiple of
    // 8, so both pointers keep their 32-byte alignment.
    for (; frame + simd_width <= num_stereo_samples; frame += simd_width)
    {
        _mm256_store_ps(&mono_dst[frame], downmix8(stereo_src + frame * 2));
    }

    // Scalar tail for the last (num_stereo_samples % 8) frames.
    for (; frame < num_stereo_samples; ++frame)
    {
        const float left = stereo_src[frame * 2];
        const float right = stereo_src[frame * 2 + 1];
        mono_dst[frame] = (left + right) * 0.5f;
    }
}
|
||||
|
||||
/**
 * @brief Peak limiter with instant attack and exponential release (AVX).
 *
 * The vector loops derive one gain per block from the block's absolute peak
 * (32 samples in the unrolled loop, 8 in the remainder loop); the scalar tail
 * updates the gain per sample. Whenever the required gain is lower than the
 * current gain it is adopted immediately; otherwise the current gain relaxes
 * towards it by one step of release_coeff = exp(-1 / (0.05 * sample_rate)).
 *
 * Returns without touching dst or limiter_state when num_samples is 0 or
 * threshold is not positive.
 *
 * @param src            Input samples; checked with ASSERT_ALIGNED against ALIGNMENT_AVX.
 * @param dst            Output samples; same alignment check.
 * @param threshold      Absolute level above which gain reduction is applied.
 * @param limiter_state  Optional in/out gain carried between calls; a gain of
 *                       1.0 is assumed when null.
 * @param sample_rate    Sample rate used to derive the release coefficient.
 * @param num_samples    Number of floats in src and dst.
 */
SIMD_EXPORT void limit_audio(const float *src, float *dst, float threshold, float *limiter_state, float sample_rate,
                             size_t num_samples)
{
    ASSERT_ALIGNED(src, ALIGNMENT_AVX);
    ASSERT_ALIGNED(dst, ALIGNMENT_AVX);

    if (num_samples == 0 || threshold <= 0.0f)
    {
        return;
    }

    constexpr size_t lanes = 8;
    constexpr size_t block = lanes * 4;
    constexpr float release_seconds = 0.05f;
    const float release_coeff = std::exp(-1.0f / (release_seconds * sample_rate));

    float gain = (limiter_state != nullptr) ? *limiter_state : 1.0f;

    // andnot(sign_mask, x) clears the sign bit of every lane, i.e. computes |x|.
    const __m256 sign_mask = _mm256_set1_ps(-0.0f);

    // Reduces the eight lanes of `v` to their maximum.
    const auto horizontal_max = [](__m256 v) -> float {
        const __m128 upper = _mm256_extractf128_ps(v, 1);
        const __m128 lower = _mm256_extractf128_ps(v, 0);
        const __m128 folded = _mm_max_ps(upper, lower);
        const __m128 pair_max = _mm_max_ps(folded, _mm_shuffle_ps(folded, folded, _MM_SHUFFLE(2, 3, 0, 1)));
        const __m128 full_max = _mm_max_ps(pair_max, _mm_shuffle_ps(pair_max, pair_max, _MM_SHUFFLE(1, 0, 3, 2)));
        return _mm_cvtss_f32(full_max);
    };

    // Moves `gain` towards the gain required for the given level:
    // immediately when it has to drop, by one release step otherwise.
    const auto follow_level = [&gain, threshold, release_coeff](float level) {
        const float wanted = level > threshold ? threshold / level : 1.0f;
        if (wanted < gain)
        {
            gain = wanted;
        }
        else
        {
            gain = wanted + (gain - wanted) * release_coeff;
        }
    };

    size_t pos = 0;

    // Unrolled main loop: one gain decision per 32 samples.
    while (pos + block <= num_samples)
    {
        const __m256 in0 = _mm256_load_ps(src + pos);
        const __m256 in1 = _mm256_load_ps(src + pos + 8);
        const __m256 in2 = _mm256_load_ps(src + pos + 16);
        const __m256 in3 = _mm256_load_ps(src + pos + 24);

        const __m256 mag0 = _mm256_andnot_ps(sign_mask, in0);
        const __m256 mag1 = _mm256_andnot_ps(sign_mask, in1);
        const __m256 mag2 = _mm256_andnot_ps(sign_mask, in2);
        const __m256 mag3 = _mm256_andnot_ps(sign_mask, in3);

        const __m256 block_mag = _mm256_max_ps(_mm256_max_ps(mag0, mag1),
                                               _mm256_max_ps(mag2, mag3));

        follow_level(horizontal_max(block_mag));

        const __m256 gain_lanes = _mm256_set1_ps(gain);

        const __m256 out0 = _mm256_mul_ps(in0, gain_lanes);
        const __m256 out1 = _mm256_mul_ps(in1, gain_lanes);
        const __m256 out2 = _mm256_mul_ps(in2, gain_lanes);
        const __m256 out3 = _mm256_mul_ps(in3, gain_lanes);

        _mm256_store_ps(dst + pos, out0);
        _mm256_store_ps(dst + pos + 8, out1);
        _mm256_store_ps(dst + pos + 16, out2);
        _mm256_store_ps(dst + pos + 24, out3);

        pos += block;
    }

    // Remaining whole vectors: one gain decision per 8 samples.
    while (pos + lanes <= num_samples)
    {
        const __m256 in = _mm256_load_ps(src + pos);

        follow_level(horizontal_max(_mm256_andnot_ps(sign_mask, in)));

        _mm256_store_ps(dst + pos, _mm256_mul_ps(in, _mm256_set1_ps(gain)));

        pos += lanes;
    }

    // Scalar tail: one gain decision per sample.
    for (; pos < num_samples; ++pos)
    {
        const float sample = src[pos];
        follow_level(std::fabs(sample));
        dst[pos] = sample * gain;
    }

    if (limiter_state != nullptr)
    {
        *limiter_state = gain;
    }
}
|
||||
|
||||
/**
 * @brief Applies a linear fade-in and/or fade-out while copying src to dst (AVX).
 *
 * Three passes are made over the buffer:
 *   1. fade-in  : samples [0, min(fade_in_samples, num_samples)) are scaled by k / fade_in_samples;
 *   2. middle   : samples [fade_in_samples, num_samples - fade_out_samples) are copied unchanged;
 *   3. fade-out : the last fade_out_samples samples are scaled by 1 - offset / fade_out_samples.
 *
 * Notes on edge cases, as implemented:
 *   - the fade-out pass only runs when num_samples > fade_out_samples; otherwise
 *     samples outside the fade-in region are not written to dst;
 *   - when the two fade regions overlap, the fade-out pass recomputes the
 *     overlapping samples from src, replacing the fade-in result there.
 *
 * @param src               Input samples; checked with ASSERT_ALIGNED against ALIGNMENT_AVX.
 * @param dst               Output samples; same alignment check.
 * @param fade_in_samples   Length of the fade-in ramp in samples (0 disables it).
 * @param fade_out_samples  Length of the fade-out ramp in samples (0 disables it).
 * @param num_samples       Number of floats in src and dst.
 */
SIMD_EXPORT void fade_audio(const float *src, float *dst, size_t fade_in_samples, size_t fade_out_samples,
                            size_t num_samples)
{
    ASSERT_ALIGNED(src, ALIGNMENT_AVX);
    ASSERT_ALIGNED(dst, ALIGNMENT_AVX);

    if (num_samples == 0)
    {
        return;
    }

    constexpr size_t simd_width = 8;
    constexpr size_t unroll_factor = 4;
    constexpr size_t block = simd_width * unroll_factor;

    // Builds the eight gains (first + 0 .. first + 7) * step, lowest index in lane 0.
    const auto ramp = [](size_t first, float step) {
        return _mm256_set_ps(static_cast<float>(first + 7) * step, static_cast<float>(first + 6) * step,
                             static_cast<float>(first + 5) * step, static_cast<float>(first + 4) * step,
                             static_cast<float>(first + 3) * step, static_cast<float>(first + 2) * step,
                             static_cast<float>(first + 1) * step, static_cast<float>(first) * step);
    };

    // ---- Pass 1: fade-in. Starts at index 0, so aligned accesses are valid.
    if (fade_in_samples > 0)
    {
        const float fade_in_step = 1.0f / static_cast<float>(fade_in_samples);
        const size_t fade_in_end = std::min(fade_in_samples, num_samples);

        size_t i = 0;
        for (; i + block <= fade_in_end; i += block)
        {
            for (size_t k = 0; k < block; k += simd_width)
            {
                const auto gain = ramp(i + k, fade_in_step);
                const auto samples = _mm256_load_ps(&src[i + k]);
                _mm256_store_ps(&dst[i + k], _mm256_mul_ps(samples, gain));
            }
        }

        for (; i < fade_in_end; ++i)
        {
            const float gain = static_cast<float>(i) / static_cast<float>(fade_in_samples);
            dst[i] = src[i] * gain;
        }
    }

    // ---- Pass 2: unmodified middle section.
    const size_t middle_start = fade_in_samples;
    const size_t middle_end = num_samples > fade_out_samples ? num_samples - fade_out_samples : 0;

    if (middle_end > middle_start)
    {
        size_t j = middle_start;

        // middle_start is an arbitrary sample offset, so &src[j] is generally
        // not 32-byte aligned: unaligned loads/stores are required here.
        for (; j + block <= middle_end; j += block)
        {
            const auto a0 = _mm256_loadu_ps(&src[j]);
            const auto a1 = _mm256_loadu_ps(&src[j + 8]);
            const auto a2 = _mm256_loadu_ps(&src[j + 16]);
            const auto a3 = _mm256_loadu_ps(&src[j + 24]);

            _mm256_storeu_ps(&dst[j], a0);
            _mm256_storeu_ps(&dst[j + 8], a1);
            _mm256_storeu_ps(&dst[j + 16], a2);
            _mm256_storeu_ps(&dst[j + 24], a3);
        }

        for (; j < middle_end; ++j)
        {
            dst[j] = src[j];
        }
    }

    // ---- Pass 3: fade-out.
    if (fade_out_samples > 0 && num_samples > fade_out_samples)
    {
        const size_t fade_out_start = num_samples - fade_out_samples;
        const float fade_out_step = 1.0f / static_cast<float>(fade_out_samples);
        const auto one = _mm256_set1_ps(1.0f);

        size_t j = fade_out_start;

        // fade_out_start is an arbitrary sample offset as well: unaligned access.
        for (; j + block <= num_samples; j += block)
        {
            const size_t fade_out_offset = j - fade_out_start;
            for (size_t k = 0; k < block; k += simd_width)
            {
                const auto gain = _mm256_sub_ps(one, ramp(fade_out_offset + k, fade_out_step));
                const auto samples = _mm256_loadu_ps(&src[j + k]);
                _mm256_storeu_ps(&dst[j + k], _mm256_mul_ps(samples, gain));
            }
        }

        for (; j < num_samples; ++j)
        {
            const size_t fade_out_offset = j - fade_out_start;
            const float gain = 1.0f - static_cast<float>(fade_out_offset) / static_cast<float>(fade_out_samples);
            dst[j] = src[j] * gain;
        }
    }
}
|
||||
|
||||
/**
 * @brief Three-band equaliser built from two one-pole filters (AVX).
 *
 * Per sample x:
 *   low  = 0.02 * x         + 0.98 * low_prev
 *   high = 0.1  * (x - low) + 0.9  * high_prev
 *   mid  = (x - low - high) * 0.7
 *   out  = low * low_gain + mid * mid_gain + high * high_gain
 *
 * Both filters are recurrences: each output depends on the output of the
 * sample immediately before it. The recurrence is therefore evaluated
 * serially, and only the band mix is vectorised. Every sample is filtered
 * the same way wherever it falls in the buffer, so the result does not
 * depend on the buffer length.
 *
 * @param src          Input samples; checked with ASSERT_ALIGNED against ALIGNMENT_AVX.
 * @param dst          Output samples; same alignment check.
 * @param low_gain     Gain applied to the low band.
 * @param mid_gain     Gain applied to the mid band.
 * @param high_gain    Gain applied to the high band.
 * @param eq_state     Optional in/out filter memory: eq_state[0] is the low
 *                     filter state, eq_state[1] the high filter state. Both
 *                     start at 0 when null.
 * @param num_samples  Number of floats in src and dst.
 */
SIMD_EXPORT void simple_eq(const float *src, float *dst, float low_gain, float mid_gain, float high_gain, float *eq_state,
                           size_t num_samples)
{
    ASSERT_ALIGNED(src, ALIGNMENT_AVX);
    ASSERT_ALIGNED(dst, ALIGNMENT_AVX);

    if (num_samples == 0)
    {
        return;
    }

    constexpr size_t simd_width = 8;

    constexpr float low_cutoff = 0.02f;
    constexpr float high_cutoff = 0.1f;
    constexpr float mid_factor = 0.7f;

    float low_state = eq_state != nullptr ? *eq_state : 0.0f;
    float high_state = eq_state != nullptr ? *(eq_state + 1) : 0.0f;

    // Advances both one-pole filters by one input sample.
    const auto advance_filters = [&low_state, &high_state](float input) {
        low_state = low_cutoff * input + (1.0f - low_cutoff) * low_state;
        high_state = high_cutoff * (input - low_state) + (1.0f - high_cutoff) * high_state;
    };

    const auto low_gain_vec = _mm256_set1_ps(low_gain);
    const auto mid_gain_vec = _mm256_set1_ps(mid_gain);
    const auto high_gain_vec = _mm256_set1_ps(high_gain);
    const auto mid_factor_vec = _mm256_set1_ps(mid_factor);

    size_t i = 0;

    for (; i + simd_width <= num_samples; i += simd_width)
    {
        // Serial part: run the recurrences and park the band outputs in
        // 32-byte aligned scratch buffers.
        alignas(32) float low_band[simd_width];
        alignas(32) float high_band[simd_width];
        for (size_t k = 0; k < simd_width; ++k)
        {
            advance_filters(src[i + k]);
            low_band[k] = low_state;
            high_band[k] = high_state;
        }

        // Parallel part: derive the mid band and mix the three bands.
        // The input is loaded before the store.
        const auto input = _mm256_load_ps(&src[i]);
        const auto low = _mm256_load_ps(low_band);
        const auto high = _mm256_load_ps(high_band);

        const auto mid = _mm256_mul_ps(_mm256_sub_ps(_mm256_sub_ps(input, low), high), mid_factor_vec);

        const auto result = _mm256_add_ps(
            _mm256_add_ps(_mm256_mul_ps(low, low_gain_vec), _mm256_mul_ps(mid, mid_gain_vec)),
            _mm256_mul_ps(high, high_gain_vec));

        _mm256_store_ps(&dst[i], result);
    }

    // Scalar tail for the last (num_samples % 8) samples.
    for (; i < num_samples; ++i)
    {
        const float input = src[i];
        advance_filters(input);

        const float mid_output = (input - low_state - high_state) * mid_factor;

        dst[i] = low_state * low_gain + mid_output * mid_gain + high_state * high_gain;
    }

    if (eq_state != nullptr)
    {
        *eq_state = low_state;
        *(eq_state + 1) = high_state;
    }
}
|
||||
}
|
||||
#endif
|
||||
7
src/simd/simd_avx512/CMakeLists.txt
Normal file
7
src/simd/simd_avx512/CMakeLists.txt
Normal file
@@ -0,0 +1,7 @@
|
||||
project(alicho_simd_avx512)

# The AVX-512 kernels are only built for x86-64 targets; on any other
# architecture this directory defines no library.
if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64")
    # simple_library() is a project helper; it is presumed to create a target
    # named ${PROJECT_NAME} from the sources in this directory.
    simple_library(SHARED)

    # Enable AVX-512 code generation for this target only. MSVC does not
    # understand the GCC/Clang -mavx512* switches and needs /arch:AVX512.
    target_compile_options(${PROJECT_NAME} PRIVATE
        "$<$<CXX_COMPILER_ID:MSVC>:/arch:AVX512>"
        "$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-mavx512f;-mavx512bw;-mavx512vl;-mavx512dq>"
    )

    target_link_libraries(${PROJECT_NAME} PUBLIC alicho_simd_interface)
endif()
|
||||
756
src/simd/simd_avx512/simd_func.cpp
Normal file
756
src/simd/simd_avx512/simd_func.cpp
Normal file
@@ -0,0 +1,756 @@
|
||||
/**
 * @file simd_func.cpp
 * @brief x86 AVX-512 implementations of the audio processing functions
 */
|
||||
|
||||
#include "simd_interface.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <immintrin.h>
|
||||
#include "aligned_allocator.h"
|
||||
|
||||
extern "C"
|
||||
{
|
||||
/**
 * @brief Sets every sample of a buffer to the same value (AVX-512).
 *
 * @param buffer       Destination samples; checked with ASSERT_ALIGNED against ALIGNMENT_AVX512.
 * @param value        Value written to every sample.
 * @param num_samples  Number of floats to write.
 */
SIMD_EXPORT void fill_buffer(float *buffer, float value, size_t num_samples)
{
    ASSERT_ALIGNED(buffer, ALIGNMENT_AVX512);

    constexpr size_t lanes = 16;
    constexpr size_t block = lanes * 4;

    const __m512 splat = _mm512_set1_ps(value);

    size_t pos = 0;

    // Unrolled main loop: 64 samples per iteration.
    while (pos + block <= num_samples)
    {
        _mm512_store_ps(buffer + pos, splat);
        _mm512_store_ps(buffer + pos + 16, splat);
        _mm512_store_ps(buffer + pos + 32, splat);
        _mm512_store_ps(buffer + pos + 48, splat);
        pos += block;
    }

    // Remaining whole vectors.
    while (pos + lanes <= num_samples)
    {
        _mm512_store_ps(buffer + pos, splat);
        pos += lanes;
    }

    // Scalar tail.
    while (pos < num_samples)
    {
        buffer[pos] = value;
        ++pos;
    }
}
|
||||
|
||||
/**
 * @brief Adds two buffers sample by sample: dst[k] = src1[k] + src2[k] (AVX-512).
 *
 * @param src1         First input; checked with ASSERT_ALIGNED against ALIGNMENT_AVX512.
 * @param src2         Second input; same alignment check.
 * @param dst          Output; same alignment check.
 * @param num_samples  Number of floats in each buffer.
 */
SIMD_EXPORT void mix_audio(const float *src1, const float *src2, float *dst, size_t num_samples)
{
    ASSERT_ALIGNED(src1, ALIGNMENT_AVX512);
    ASSERT_ALIGNED(src2, ALIGNMENT_AVX512);
    ASSERT_ALIGNED(dst, ALIGNMENT_AVX512);

    constexpr size_t lanes = 16;
    constexpr size_t block = lanes * 4;

    size_t pos = 0;

    // Unrolled main loop: 64 samples per iteration, all loads before the stores.
    while (pos + block <= num_samples)
    {
        const __m512 lhs0 = _mm512_load_ps(src1 + pos);
        const __m512 lhs1 = _mm512_load_ps(src1 + pos + 16);
        const __m512 lhs2 = _mm512_load_ps(src1 + pos + 32);
        const __m512 lhs3 = _mm512_load_ps(src1 + pos + 48);

        const __m512 rhs0 = _mm512_load_ps(src2 + pos);
        const __m512 rhs1 = _mm512_load_ps(src2 + pos + 16);
        const __m512 rhs2 = _mm512_load_ps(src2 + pos + 32);
        const __m512 rhs3 = _mm512_load_ps(src2 + pos + 48);

        const __m512 sum0 = _mm512_add_ps(lhs0, rhs0);
        const __m512 sum1 = _mm512_add_ps(lhs1, rhs1);
        const __m512 sum2 = _mm512_add_ps(lhs2, rhs2);
        const __m512 sum3 = _mm512_add_ps(lhs3, rhs3);

        _mm512_store_ps(dst + pos, sum0);
        _mm512_store_ps(dst + pos + 16, sum1);
        _mm512_store_ps(dst + pos + 32, sum2);
        _mm512_store_ps(dst + pos + 48, sum3);

        pos += block;
    }

    // Remaining whole vectors.
    while (pos + lanes <= num_samples)
    {
        const __m512 lhs = _mm512_load_ps(src1 + pos);
        const __m512 rhs = _mm512_load_ps(src2 + pos);
        _mm512_store_ps(dst + pos, _mm512_add_ps(lhs, rhs));
        pos += lanes;
    }

    // Scalar tail.
    while (pos < num_samples)
    {
        dst[pos] = src1[pos] + src2[pos];
        ++pos;
    }
}
|
||||
|
||||
/**
 * @brief Multiplies every sample by a constant gain: dst[k] = src[k] * gain (AVX-512).
 *
 * @param src          Input samples; checked with ASSERT_ALIGNED against ALIGNMENT_AVX512.
 * @param dst          Output samples; same alignment check.
 * @param gain         Linear gain factor.
 * @param num_samples  Number of floats in src and dst.
 */
SIMD_EXPORT void apply_gain(const float *src, float *dst, float gain, size_t num_samples)
{
    ASSERT_ALIGNED(src, ALIGNMENT_AVX512);
    ASSERT_ALIGNED(dst, ALIGNMENT_AVX512);

    constexpr size_t lanes = 16;
    constexpr size_t block = lanes * 4;

    const __m512 gain_lanes = _mm512_set1_ps(gain);

    size_t pos = 0;

    // Unrolled main loop: 64 samples per iteration, all loads before the stores.
    while (pos + block <= num_samples)
    {
        const __m512 in0 = _mm512_load_ps(src + pos);
        const __m512 in1 = _mm512_load_ps(src + pos + 16);
        const __m512 in2 = _mm512_load_ps(src + pos + 32);
        const __m512 in3 = _mm512_load_ps(src + pos + 48);

        const __m512 out0 = _mm512_mul_ps(in0, gain_lanes);
        const __m512 out1 = _mm512_mul_ps(in1, gain_lanes);
        const __m512 out2 = _mm512_mul_ps(in2, gain_lanes);
        const __m512 out3 = _mm512_mul_ps(in3, gain_lanes);

        _mm512_store_ps(dst + pos, out0);
        _mm512_store_ps(dst + pos + 16, out1);
        _mm512_store_ps(dst + pos + 32, out2);
        _mm512_store_ps(dst + pos + 48, out3);

        pos += block;
    }

    // Remaining whole vectors.
    while (pos + lanes <= num_samples)
    {
        _mm512_store_ps(dst + pos, _mm512_mul_ps(_mm512_load_ps(src + pos), gain_lanes));
        pos += lanes;
    }

    // Scalar tail.
    while (pos < num_samples)
    {
        dst[pos] = src[pos] * gain;
        ++pos;
    }
}
|
||||
|
||||
/**
 * @brief Computes the root-mean-square level of a buffer (AVX-512).
 *
 * Squares are accumulated in four independent float vectors, reduced to a
 * single value, and the scalar tail is accumulated in double precision.
 *
 * @param src          Input samples; checked with ASSERT_ALIGNED against ALIGNMENT_AVX512.
 * @param num_samples  Number of floats in src.
 * @return sqrt(sum(src[k]^2) / num_samples), or 0.0f for an empty buffer.
 */
SIMD_EXPORT float calculate_rms(const float *src, size_t num_samples)
{
    ASSERT_ALIGNED(src, ALIGNMENT_AVX512);

    // An empty buffer would otherwise evaluate sqrt(0 / 0) and return NaN.
    if (num_samples == 0)
    {
        return 0.0f;
    }

    constexpr size_t simd_width = 16;
    constexpr size_t unroll_factor = 4;
    size_t i = 0;
    auto sum_squares0 = _mm512_setzero_ps();
    auto sum_squares1 = _mm512_setzero_ps();
    auto sum_squares2 = _mm512_setzero_ps();
    auto sum_squares3 = _mm512_setzero_ps();

    // Unrolled main loop: 64 samples per iteration.
    for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
    {
        const auto a0 = _mm512_load_ps(&src[i]);
        const auto a1 = _mm512_load_ps(&src[i + 16]);
        const auto a2 = _mm512_load_ps(&src[i + 32]);
        const auto a3 = _mm512_load_ps(&src[i + 48]);

        sum_squares0 = _mm512_add_ps(sum_squares0, _mm512_mul_ps(a0, a0));
        sum_squares1 = _mm512_add_ps(sum_squares1, _mm512_mul_ps(a1, a1));
        sum_squares2 = _mm512_add_ps(sum_squares2, _mm512_mul_ps(a2, a2));
        sum_squares3 = _mm512_add_ps(sum_squares3, _mm512_mul_ps(a3, a3));
    }

    auto sum_squares = _mm512_add_ps(_mm512_add_ps(sum_squares0, sum_squares1),
                                     _mm512_add_ps(sum_squares2, sum_squares3));

    // Remaining whole vectors.
    for (; i + simd_width <= num_samples; i += simd_width)
    {
        const auto a = _mm512_load_ps(&src[i]);
        sum_squares = _mm512_add_ps(sum_squares, _mm512_mul_ps(a, a));
    }

    double total_sum = _mm512_reduce_add_ps(sum_squares);

    // Scalar tail, accumulated in double precision.
    for (; i < num_samples; ++i)
    {
        total_sum += static_cast<double>(src[i]) * static_cast<double>(src[i]);
    }

    return static_cast<float>(std::sqrt(total_sum / static_cast<double>(num_samples)));
}
|
||||
|
||||
/**
 * @brief Returns the largest absolute sample value found in a buffer (AVX-512).
 *
 * @param src          Input samples; checked with ASSERT_ALIGNED against ALIGNMENT_AVX512.
 * @param num_samples  Number of floats to scan.
 * @return The maximum of |src[k]| over the buffer; 0.0f when the buffer is empty.
 */
SIMD_EXPORT float calculate_peak(const float *src, size_t num_samples)
{
    ASSERT_ALIGNED(src, ALIGNMENT_AVX512);

    constexpr size_t lanes = 16;
    constexpr size_t block = lanes * 4;

    // andnot(sign_mask, x) clears the sign bit of every lane, i.e. computes |x|.
    const __m512 sign_mask = _mm512_set1_ps(-0.0f);

    // Four independent running maxima, one per vector of the unrolled block.
    __m512 running_a = _mm512_setzero_ps();
    __m512 running_b = _mm512_setzero_ps();
    __m512 running_c = _mm512_setzero_ps();
    __m512 running_d = _mm512_setzero_ps();

    size_t pos = 0;
    while (pos + block <= num_samples)
    {
        const __m512 mag_a = _mm512_andnot_ps(sign_mask, _mm512_load_ps(src + pos));
        const __m512 mag_b = _mm512_andnot_ps(sign_mask, _mm512_load_ps(src + pos + 16));
        const __m512 mag_c = _mm512_andnot_ps(sign_mask, _mm512_load_ps(src + pos + 32));
        const __m512 mag_d = _mm512_andnot_ps(sign_mask, _mm512_load_ps(src + pos + 48));

        running_a = _mm512_max_ps(running_a, mag_a);
        running_b = _mm512_max_ps(running_b, mag_b);
        running_c = _mm512_max_ps(running_c, mag_c);
        running_d = _mm512_max_ps(running_d, mag_d);

        pos += block;
    }

    __m512 running = _mm512_max_ps(_mm512_max_ps(running_a, running_b),
                                   _mm512_max_ps(running_c, running_d));

    // Remaining whole vectors.
    while (pos + lanes <= num_samples)
    {
        const __m512 mag = _mm512_andnot_ps(sign_mask, _mm512_load_ps(src + pos));
        running = _mm512_max_ps(running, mag);
        pos += lanes;
    }

    float peak = _mm512_reduce_max_ps(running);

    // Scalar tail for the last (num_samples % 16) samples.
    for (; pos < num_samples; ++pos)
    {
        const float magnitude = std::fabs(src[pos]);
        peak = magnitude > peak ? magnitude : peak;
    }

    return peak;
}
|
||||
|
||||
/**
 * @brief Scales a buffer so that its absolute peak equals target_peak (AVX-512).
 *
 * Returns without touching dst when num_samples is 0 or target_peak is not
 * positive. When the measured peak is below 1e-10 the input is treated as
 * silence and dst is filled with zeros; otherwise the gain
 * target_peak / peak is applied through apply_gain().
 *
 * @param src          Input samples; checked with ASSERT_ALIGNED against ALIGNMENT_AVX512.
 * @param dst          Output samples; same alignment check.
 * @param target_peak  Desired absolute peak of the output.
 * @param num_samples  Number of floats in src and dst.
 */
SIMD_EXPORT void normalize_audio(const float *src, float *dst, float target_peak, size_t num_samples)
{
    ASSERT_ALIGNED(src, ALIGNMENT_AVX512);
    ASSERT_ALIGNED(dst, ALIGNMENT_AVX512);

    const bool nothing_to_do = (num_samples == 0) || (target_peak <= 0.0f);
    if (nothing_to_do)
    {
        return;
    }

    const float measured_peak = calculate_peak(src, num_samples);

    // Normal case: the signal is audible, so scale it.
    if (!(measured_peak < 1e-10f))
    {
        apply_gain(src, dst, target_peak / measured_peak, num_samples);
        return;
    }

    // Silent input: dividing by the peak would blow up, so emit zeros instead.
    constexpr size_t lanes = 16;
    constexpr size_t block = lanes * 4;
    const __m512 zeros = _mm512_setzero_ps();

    size_t pos = 0;
    while (pos + block <= num_samples)
    {
        _mm512_store_ps(dst + pos, zeros);
        _mm512_store_ps(dst + pos + 16, zeros);
        _mm512_store_ps(dst + pos + 32, zeros);
        _mm512_store_ps(dst + pos + 48, zeros);
        pos += block;
    }

    while (pos + lanes <= num_samples)
    {
        _mm512_store_ps(dst + pos, zeros);
        pos += lanes;
    }

    while (pos < num_samples)
    {
        dst[pos] = 0.0f;
        ++pos;
    }
}
|
||||
|
||||
/**
 * @brief Down-mixes interleaved stereo (L0 R0 L1 R1 ...) to mono (AVX-512).
 *
 * Every output sample is (left + right) * 0.5.
 *
 * @param stereo_src          Interleaved input holding 2 * num_stereo_samples floats;
 *                            checked with ASSERT_ALIGNED against ALIGNMENT_AVX512.
 * @param mono_dst            Output buffer of num_stereo_samples floats; same alignment check.
 * @param num_stereo_samples  Number of stereo frames (left/right pairs).
 */
SIMD_EXPORT void stereo_to_mono(const float *stereo_src, float *mono_dst, size_t num_stereo_samples)
{
    ASSERT_ALIGNED(stereo_src, ALIGNMENT_AVX512);
    ASSERT_ALIGNED(mono_dst, ALIGNMENT_AVX512);

    if (num_stereo_samples == 0)
    {
        return;
    }

    constexpr size_t lanes = 16;
    constexpr size_t floats_per_block = lanes * 2 * 4; // 128 interleaved floats
    constexpr size_t frames_per_block = lanes * 4;     // 64 mono samples

    const __m512 half = _mm512_set1_ps(0.5f);

    // Index vectors for _mm512_permutex2var_ps: indices 0..15 select from the
    // first operand, 16..31 from the second. Even indices pick the left
    // channel, odd indices the right channel.
    const __m512i pick_left = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
    const __m512i pick_right = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);

    // Averages the left/right channels of the 16 frames held in two vectors.
    const auto downmix16 = [&](__m512 first, __m512 second) {
        const __m512 left = _mm512_permutex2var_ps(first, pick_left, second);
        const __m512 right = _mm512_permutex2var_ps(first, pick_right, second);
        return _mm512_mul_ps(_mm512_add_ps(left, right), half);
    };

    size_t in_pos = 0;
    size_t out_pos = 0;

    // Unrolled main loop: 64 frames per iteration, all loads before the stores.
    while (in_pos + floats_per_block <= num_stereo_samples * 2)
    {
        const __m512 chunk0 = _mm512_load_ps(stereo_src + in_pos);
        const __m512 chunk1 = _mm512_load_ps(stereo_src + in_pos + 16);
        const __m512 chunk2 = _mm512_load_ps(stereo_src + in_pos + 32);
        const __m512 chunk3 = _mm512_load_ps(stereo_src + in_pos + 48);
        const __m512 chunk4 = _mm512_load_ps(stereo_src + in_pos + 64);
        const __m512 chunk5 = _mm512_load_ps(stereo_src + in_pos + 80);
        const __m512 chunk6 = _mm512_load_ps(stereo_src + in_pos + 96);
        const __m512 chunk7 = _mm512_load_ps(stereo_src + in_pos + 112);

        const __m512 mixed0 = downmix16(chunk0, chunk1);
        const __m512 mixed1 = downmix16(chunk2, chunk3);
        const __m512 mixed2 = downmix16(chunk4, chunk5);
        const __m512 mixed3 = downmix16(chunk6, chunk7);

        _mm512_store_ps(mono_dst + out_pos, mixed0);
        _mm512_store_ps(mono_dst + out_pos + 16, mixed1);
        _mm512_store_ps(mono_dst + out_pos + 32, mixed2);
        _mm512_store_ps(mono_dst + out_pos + 48, mixed3);

        in_pos += floats_per_block;
        out_pos += frames_per_block;
    }

    // Scalar tail for the frames not covered by the vector loop.
    for (size_t frame = in_pos / 2; frame < num_stereo_samples; ++frame)
    {
        const float left = stereo_src[frame * 2];
        const float right = stereo_src[frame * 2 + 1];
        mono_dst[frame] = (left + right) * 0.5f;
    }
}
|
||||
|
||||
/**
 * @brief Peak limiter with instant attack and exponential release (AVX-512 path).
 *
 * For every processed group of samples the peak magnitude is compared with
 * @p threshold. If the gain needed to keep that peak at the threshold is lower
 * than the current gain, the gain drops to it immediately; otherwise the gain
 * relaxes exponentially towards the target. The gain is applied uniformly to
 * the whole group (64 samples, then 16 samples, then single samples).
 *
 * The release coefficient is defined per sample
 * (exp(-1 / (release_time * sample_rate))). When one gain update covers N
 * samples, the coefficient is raised to the N-th power so that the release time
 * is the same in the vector loops and in the scalar tail. Previously the
 * per-sample coefficient was applied once per block, which made the release
 * 64x / 16x slower in the vector paths.
 *
 * @param src            Input samples; read with aligned 512-bit loads.
 * @param dst            Output samples; written with aligned 512-bit stores.
 * @param threshold      Maximum allowed magnitude. If <= 0 the function returns
 *                       without touching @p dst.
 * @param limiter_state  Optional in/out gain carried between calls. When null
 *                       the gain starts at 1.0 and is not stored.
 * @param sample_rate    Sample rate used to derive the release coefficient.
 * @param num_samples    Number of samples to process.
 */
SIMD_EXPORT void limit_audio(const float *src, float *dst, float threshold, float *limiter_state, float sample_rate,
                             size_t num_samples)
{
    ASSERT_ALIGNED(src, ALIGNMENT_AVX512);
    ASSERT_ALIGNED(dst, ALIGNMENT_AVX512);

    if (num_samples == 0 || threshold <= 0.0f)
    {
        return;
    }

    constexpr size_t simd_width = 16;
    constexpr size_t unroll_factor = 4;
    constexpr size_t block_width = simd_width * unroll_factor;
    constexpr float release_time = 0.05f;

    // Per-sample decay exponent; scaled by the number of samples each gain
    // update stands for.
    const float release_exponent = -1.0f / (release_time * sample_rate);
    const float release_coeff_sample = std::exp(release_exponent);
    const float release_coeff_vector = std::exp(release_exponent * static_cast<float>(simd_width));
    const float release_coeff_block = std::exp(release_exponent * static_cast<float>(block_width));

    float current_gain = limiter_state != nullptr ? *limiter_state : 1.0f;

    // Moves current_gain towards the gain required for `peak`: instant attack,
    // exponential release using the supplied coefficient.
    const auto update_gain = [&current_gain, threshold](float peak, float release_coeff) {
        const float target_gain = peak > threshold ? threshold / peak : 1.0f;
        if (target_gain < current_gain)
        {
            current_gain = target_gain;
        }
        else
        {
            current_gain = target_gain + (current_gain - target_gain) * release_coeff;
        }
    };

    size_t i = 0;

    // 64 samples per iteration, one gain decision for the whole block.
    for (; i + block_width <= num_samples; i += block_width)
    {
        const auto a0 = _mm512_load_ps(&src[i]);
        const auto a1 = _mm512_load_ps(&src[i + 16]);
        const auto a2 = _mm512_load_ps(&src[i + 32]);
        const auto a3 = _mm512_load_ps(&src[i + 48]);

        const auto max_abs = _mm512_max_ps(_mm512_max_ps(_mm512_abs_ps(a0), _mm512_abs_ps(a1)),
                                           _mm512_max_ps(_mm512_abs_ps(a2), _mm512_abs_ps(a3)));

        update_gain(_mm512_reduce_max_ps(max_abs), release_coeff_block);

        const auto gain_vec = _mm512_set1_ps(current_gain);

        _mm512_store_ps(&dst[i], _mm512_mul_ps(a0, gain_vec));
        _mm512_store_ps(&dst[i + 16], _mm512_mul_ps(a1, gain_vec));
        _mm512_store_ps(&dst[i + 32], _mm512_mul_ps(a2, gain_vec));
        _mm512_store_ps(&dst[i + 48], _mm512_mul_ps(a3, gain_vec));
    }

    // 16 samples per iteration.
    for (; i + simd_width <= num_samples; i += simd_width)
    {
        const auto a = _mm512_load_ps(&src[i]);

        update_gain(_mm512_reduce_max_ps(_mm512_abs_ps(a)), release_coeff_vector);

        _mm512_store_ps(&dst[i], _mm512_mul_ps(a, _mm512_set1_ps(current_gain)));
    }

    // Scalar tail: one gain decision per sample.
    for (; i < num_samples; ++i)
    {
        const float sample = src[i];

        update_gain(std::fabs(sample), release_coeff_sample);

        dst[i] = sample * current_gain;
    }

    if (limiter_state != nullptr)
    {
        *limiter_state = current_gain;
    }
}
|
||||
|
||||
/// Returns a vector whose lane k holds static_cast<float>(base + k), k = 0..15.
static inline __m512 fade_index_ramp(size_t base)
{
    alignas(64) float lanes[16];
    for (size_t lane = 0; lane < 16; ++lane)
    {
        lanes[lane] = static_cast<float>(base + lane);
    }
    return _mm512_load_ps(lanes);
}

/**
 * @brief Applies a linear fade-in and/or fade-out while copying src to dst (AVX-512 path).
 *
 * The buffer is handled in three regions:
 *  - [0, min(fade_in_samples, num_samples)): gain n / fade_in_samples.
 *  - [fade_in_samples, num_samples - fade_out_samples): plain copy.
 *  - [num_samples - fade_out_samples, num_samples): gain 1 - k / fade_out_samples,
 *    where k is the offset from the start of the fade-out.
 *
 * The fade-out region is written last, so where the two fades overlap the
 * fade-out result replaces the fade-in result. When fade_out_samples is larger
 * than num_samples no fade-out is applied and samples after the fade-in region
 * are left untouched in dst.
 *
 * Changes relative to the previous implementation:
 *  - The copy and fade-out regions start at arbitrary sample offsets, which are
 *    generally not 64-byte aligned. They now use unaligned loads/stores; the
 *    aligned variants fault on such addresses.
 *  - A fade-out spanning the whole buffer (fade_out_samples == num_samples) is
 *    now applied instead of being skipped, which previously left dst unwritten.
 *
 * @param src               Input samples; base pointer is asserted aligned.
 * @param dst               Output samples; base pointer is asserted aligned.
 * @param fade_in_samples   Length of the fade-in in samples (0 disables it).
 * @param fade_out_samples  Length of the fade-out in samples (0 disables it).
 * @param num_samples       Number of samples in src/dst.
 */
SIMD_EXPORT void fade_audio(const float *src, float *dst, size_t fade_in_samples, size_t fade_out_samples,
                            size_t num_samples)
{
    ASSERT_ALIGNED(src, ALIGNMENT_AVX512);
    ASSERT_ALIGNED(dst, ALIGNMENT_AVX512);

    if (num_samples == 0)
    {
        return;
    }

    constexpr size_t simd_width = 16;
    constexpr size_t unroll_factor = 4;
    constexpr size_t block_width = simd_width * unroll_factor;

    // ---- Fade-in: starts at index 0, so every vector access stays aligned.
    if (fade_in_samples > 0)
    {
        const size_t fade_in_end = std::min(fade_in_samples, num_samples);
        const float fade_in_step = 1.0f / static_cast<float>(fade_in_samples);
        const auto fade_in_step_vec = _mm512_set1_ps(fade_in_step);

        size_t i = 0;
        for (; i + block_width <= fade_in_end; i += block_width)
        {
            for (size_t k = 0; k < block_width; k += simd_width)
            {
                // gain[lane] = (i + k + lane) * fade_in_step
                const auto gain = _mm512_mul_ps(fade_index_ramp(i + k), fade_in_step_vec);
                const auto samples = _mm512_load_ps(&src[i + k]);
                _mm512_store_ps(&dst[i + k], _mm512_mul_ps(samples, gain));
            }
        }

        for (; i < fade_in_end; ++i)
        {
            const float gain = static_cast<float>(i) / static_cast<float>(fade_in_samples);
            dst[i] = src[i] * gain;
        }
    }

    // ---- Middle: unity-gain copy between the two fades.
    const size_t middle_start = fade_in_samples;
    const size_t middle_end = num_samples > fade_out_samples ? num_samples - fade_out_samples : 0;

    if (middle_end > middle_start)
    {
        size_t j = middle_start;

        // middle_start is an arbitrary offset: use unaligned accesses.
        for (; j + block_width <= middle_end; j += block_width)
        {
            for (size_t k = 0; k < block_width; k += simd_width)
            {
                _mm512_storeu_ps(&dst[j + k], _mm512_loadu_ps(&src[j + k]));
            }
        }

        for (; j < middle_end; ++j)
        {
            dst[j] = src[j];
        }
    }

    // ---- Fade-out: last fade_out_samples samples of the buffer.
    if (fade_out_samples > 0 && num_samples >= fade_out_samples)
    {
        const size_t fade_out_start = num_samples - fade_out_samples;
        const float fade_out_step = 1.0f / static_cast<float>(fade_out_samples);
        const auto fade_out_step_vec = _mm512_set1_ps(fade_out_step);
        const auto one_vec = _mm512_set1_ps(1.0f);

        size_t j = fade_out_start;

        // fade_out_start is an arbitrary offset: use unaligned accesses.
        for (; j + block_width <= num_samples; j += block_width)
        {
            const size_t fade_out_offset = j - fade_out_start;
            for (size_t k = 0; k < block_width; k += simd_width)
            {
                // gain[lane] = 1 - (fade_out_offset + k + lane) * fade_out_step
                const auto gain =
                    _mm512_sub_ps(one_vec, _mm512_mul_ps(fade_index_ramp(fade_out_offset + k), fade_out_step_vec));
                const auto samples = _mm512_loadu_ps(&src[j + k]);
                _mm512_storeu_ps(&dst[j + k], _mm512_mul_ps(samples, gain));
            }
        }

        for (; j < num_samples; ++j)
        {
            const size_t fade_out_offset = j - fade_out_start;
            const float gain = 1.0f - static_cast<float>(fade_out_offset) / static_cast<float>(fade_out_samples);
            dst[j] = src[j] * gain;
        }
    }
}
|
||||
|
||||
/**
 * @brief Three-band equalizer built from two one-pole smoothing filters (AVX-512 path).
 *
 * Per sample x:
 *   low  = low_cutoff  * x         + (1 - low_cutoff)  * low_prev
 *   high = high_cutoff * (x - low) + (1 - high_cutoff) * high_prev
 *   mid  = (x - low - high) * mid_factor
 *   out  = low * low_gain + mid * mid_gain + high * high_gain
 *
 * Both filters are recurrences: every sample depends on the previous output, so
 * they cannot be evaluated lane-parallel. Each 64-sample block therefore runs
 * the recurrence sequentially into scratch buffers, and only the band mixing is
 * vectorized.
 *
 * The previous vector loop broadcast the state to all lanes and chained
 * registers lane-wise, which computed a different filter than the scalar tail.
 * It also read the carried state from lane 12 rather than from the last sample
 * of the block. Vector and scalar paths now implement the same recurrence;
 * results may differ in the last bits because the vector mix uses fused
 * multiply-add.
 *
 * @param src          Input samples; read with aligned 512-bit loads.
 * @param dst          Output samples; written with aligned 512-bit stores.
 * @param low_gain     Gain applied to the low band.
 * @param mid_gain     Gain applied to the mid band.
 * @param high_gain    Gain applied to the high band.
 * @param eq_state     Optional in/out filter state; when non-null it must point
 *                     to at least two floats: [0] low state, [1] high state.
 * @param num_samples  Number of samples to process.
 */
SIMD_EXPORT void simple_eq(const float *src, float *dst, float low_gain, float mid_gain, float high_gain,
                           float *eq_state, size_t num_samples)
{
    ASSERT_ALIGNED(src, ALIGNMENT_AVX512);
    ASSERT_ALIGNED(dst, ALIGNMENT_AVX512);

    if (num_samples == 0)
    {
        return;
    }

    constexpr size_t simd_width = 16;
    constexpr size_t unroll_factor = 4;
    constexpr size_t block_width = simd_width * unroll_factor;

    constexpr float low_cutoff = 0.02f;
    constexpr float high_cutoff = 0.1f;
    constexpr float mid_factor = 0.7f;

    float low_state = eq_state != nullptr ? *eq_state : 0.0f;
    float high_state = eq_state != nullptr ? *(eq_state + 1) : 0.0f;

    const auto low_gain_vec = _mm512_set1_ps(low_gain);
    const auto mid_gain_vec = _mm512_set1_ps(mid_gain);
    const auto high_gain_vec = _mm512_set1_ps(high_gain);
    const auto mid_factor_vec = _mm512_set1_ps(mid_factor);

    size_t i = 0;

    for (; i + block_width <= num_samples; i += block_width)
    {
        // Scratch for the band signals of this block; 64-byte aligned so the
        // mixing pass can use aligned loads.
        alignas(64) float low_band[block_width];
        alignas(64) float high_band[block_width];

        // Pass 1: sequential filter recurrence, identical to the scalar tail.
        for (size_t k = 0; k < block_width; ++k)
        {
            const float input = src[i + k];
            low_state = low_cutoff * input + (1.0f - low_cutoff) * low_state;
            high_state = high_cutoff * (input - low_state) + (1.0f - high_cutoff) * high_state;
            low_band[k] = low_state;
            high_band[k] = high_state;
        }

        // Pass 2: vectorized band mixing.
        for (size_t k = 0; k < block_width; k += simd_width)
        {
            const auto input = _mm512_load_ps(&src[i + k]);
            const auto low = _mm512_load_ps(&low_band[k]);
            const auto high = _mm512_load_ps(&high_band[k]);

            const auto mid = _mm512_mul_ps(_mm512_sub_ps(_mm512_sub_ps(input, low), high), mid_factor_vec);

            const auto result = _mm512_fmadd_ps(
                low, low_gain_vec, _mm512_fmadd_ps(mid, mid_gain_vec, _mm512_mul_ps(high, high_gain_vec)));

            _mm512_store_ps(&dst[i + k], result);
        }
    }

    // Scalar tail for the remaining (< 64) samples.
    for (; i < num_samples; ++i)
    {
        const float input = src[i];

        const float low_output = low_cutoff * input + (1.0f - low_cutoff) * low_state;
        low_state = low_output;

        const float high_input = input - low_output;
        const float high_output = high_cutoff * high_input + (1.0f - high_cutoff) * high_state;
        high_state = high_output;

        const float mid_output = (input - low_output - high_output) * mid_factor;

        dst[i] = low_output * low_gain + mid_output * mid_gain + high_output * high_gain;
    }

    if (eq_state != nullptr)
    {
        *eq_state = low_state;
        *(eq_state + 1) = high_state;
    }
}
|
||||
}
|
||||
@@ -1,428 +0,0 @@
|
||||
/**
|
||||
* @file simd_func_dispatcher.cpp
|
||||
* @brief SIMD函数调度器实现 - 运行时函数分发的核心实现
|
||||
*
|
||||
* 本文件实现了SIMD函数调度器的核心功能,包括:
|
||||
* - 函数注册表状态查询和调试输出
|
||||
* - 版本枚举与字符串之间的双向转换
|
||||
* - 运行时函数版本选择的辅助功能
|
||||
*
|
||||
* 核心实现功能:
|
||||
* ============================================================================
|
||||
* 1. print_registry_status() - 调试诊断功能
|
||||
* - 遍历并输出所有已注册的函数及其可用版本
|
||||
* - 用于运行时验证函数注册是否正确
|
||||
* - 帮助开发者了解当前可用的SIMD优化函数
|
||||
*
|
||||
* 2. simd_func_version_to_string() - 版本到字符串转换
|
||||
* - 将枚举值转换为可读的字符串表示
|
||||
* - 用于日志输出、调试信息和用户界面显示
|
||||
* - 采用switch-case实现确保编译时类型安全
|
||||
*
|
||||
* 3. string_to_simd_func_version() - 字符串到版本转换
|
||||
* - 将字符串解析为版本枚举值
|
||||
* - 用于配置文件解析、命令行参数处理
|
||||
* - 提供回退机制:无法识别时返回SCALAR版本
|
||||
*
|
||||
* 实现特点:
|
||||
* ============================================================================
|
||||
* - 简洁明了:实现直观,易于维护和扩展
|
||||
* - 类型安全:使用强类型枚举,避免魔法数字
|
||||
* - 完整映射:覆盖所有定义的SIMD版本
|
||||
* - 健壮性:处理未知版本的边界情况
|
||||
* - 零依赖:仅依赖标准库和项目头文件
|
||||
*
|
||||
* 性能考虑:
|
||||
* ============================================================================
|
||||
* - 版本转换函数使用简单的条件判断,性能开销可忽略
|
||||
* - print_registry_status() 仅用于调试,不在性能关键路径上
|
||||
* - 字符串比较采用高效的std::string相等性判断
|
||||
*
|
||||
* 设计模式:
|
||||
* ============================================================================
|
||||
* - 该文件是simd_func_dispatcher类的实现部分
|
||||
* - 采用了接口与实现分离的设计
|
||||
* - 头文件定义接口和模板实现,cpp文件实现非模板函数
|
||||
*
|
||||
* @note 这些函数主要用于调试、日志和配置解析,不在性能关键路径上
|
||||
* @see simd_func_dispatcher.h 查看完整的类定义和接口说明
|
||||
*/
|
||||
|
||||
#include "simd_func_dispatcher.h"
|
||||
#include <cstdio>
|
||||
|
||||
/**
|
||||
* @brief 打印函数注册表状态 - 调试和诊断工具
|
||||
*
|
||||
* 该函数遍历并打印所有已注册到调度器的SIMD函数及其可用版本,
|
||||
* 是一个重要的调试和诊断工具。
|
||||
*
|
||||
* 功能详述:
|
||||
* ============================================================================
|
||||
* 1. 遍历函数注册表
|
||||
* - 访问func_registry_中每个函数名和持有者对
|
||||
* - 使用基于范围的for循环提高代码可读性
|
||||
*
|
||||
* 2. 获取版本信息
|
||||
* - 调用holder的has_implementation()检查是否有实现
|
||||
* - 如有实现,调用get_available_versions()获取所有已注册版本
|
||||
*
|
||||
* 3. 格式化输出
|
||||
* - 函数名:清晰标识当前函数
|
||||
* - 版本列表:展示所有可用的SIMD优化版本
|
||||
* - 未实现提示:明确标识未注册的函数
|
||||
*
|
||||
* 输出格式示例:
|
||||
* ============================================================================
|
||||
* ```
|
||||
* Registered SIMD Functions:
|
||||
* Function: process_audio
|
||||
* Available Versions: SCALAR SSE4 AVX2
|
||||
* Function: mix_channels
|
||||
* Available Versions: SCALAR NEON
|
||||
* Function: apply_gain
|
||||
* No implementations registered.
|
||||
* ```
|
||||
*
|
||||
* 使用场景:
|
||||
* ============================================================================
|
||||
* - 程序启动时验证函数注册是否正确
|
||||
* - 调试时检查特定函数的可用版本
|
||||
* - 性能分析时了解当前使用的优化级别
|
||||
* - 单元测试中验证注册逻辑
|
||||
*
|
||||
* 性能考虑:
|
||||
* ============================================================================
|
||||
* - 该函数仅用于调试,不应在性能关键代码中频繁调用
|
||||
* - 使用printf而非std::cout以减少头文件依赖和提高输出性能
|
||||
* - 遍历整个注册表的时间复杂度为O(n*m),n为函数数量,m为平均版本数
|
||||
*
|
||||
* @note 这是一个const成员数,不会修改调度器状态
|
||||
* @note 输出直接发送到stdout,适合命令行程序使用
|
||||
*/
|
||||
void simd_func_dispatcher::print_registry_status() const {
|
||||
// 打印标题,标识输出内容
|
||||
printf("Registered SIMD Functions:\n");
|
||||
|
||||
// 遍历函数注册表中的所有条目
|
||||
// pair.first: 函数名称(std::string)
|
||||
// pair.second: 函数持有者的unique_ptr(func_holder_base*)
|
||||
for (const auto& pair : func_registry_) {
|
||||
const auto& func_name = pair.first; // 获取函数名称
|
||||
const auto& holder = pair.second; // 获取函数持有者
|
||||
|
||||
// 输出当前函数名称
|
||||
printf("Function: %s\n", func_name.c_str());
|
||||
|
||||
// 检查该函数是否有任何版本的实现
|
||||
if (holder->has_implementation()) {
|
||||
// 获取所有已注册的版本列表
|
||||
auto versions = holder->get_available_versions();
|
||||
|
||||
// 输出版本列表的标题
|
||||
printf(" Available Versions: ");
|
||||
|
||||
// 遍历并输出每个可用版本
|
||||
// 使用switch-case将枚值转换为可读的字符串
|
||||
for (const auto& version : versions) {
|
||||
switch (version) {
|
||||
// x86/x64架构的SIMD版
|
||||
case simd_func_version::SCALAR:
|
||||
printf("SCALAR "); // 标量版本(无SIMD化)
|
||||
break;
|
||||
case simd_func_version::SSE:
|
||||
printf("SSE "); // SSE/SSE2版本(128位量)
|
||||
break;
|
||||
case simd_func_version::SSE3:
|
||||
printf("SSE3 "); // SSE3/SSSE3版本
|
||||
break;
|
||||
case simd_func_version::SSE4:
|
||||
printf("SSE4 "); // SSE4.1/SSE4.2版本
|
||||
break;
|
||||
case simd_func_version::AVX:
|
||||
printf("AVX "); // AVX版本(256位向量)
|
||||
break;
|
||||
case simd_func_version::AVX2:
|
||||
printf("AVX2 "); // AVX2+FMA版本
|
||||
break;
|
||||
case simd_func_version::AVX512:
|
||||
printf("AVX512 "); // AVX-512版本(512位量)
|
||||
break;
|
||||
|
||||
// ARM架构的SIMD版本
|
||||
case simd_func_version::NEON:
|
||||
printf("NEON "); // ARM NEON版本
|
||||
break;
|
||||
case simd_func_version::NEON_FP16:
|
||||
printf("NEON_FP16 "); // NEON半精度浮点版
|
||||
break;
|
||||
|
||||
// RISC-V架构的向量扩展
|
||||
case simd_func_version::VECTOR:
|
||||
printf("VECTOR "); // RISC-V向量扩展
|
||||
break;
|
||||
|
||||
// 处理未知版本(理论上不应出现)
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
// 版本列表输出完毕,换行
|
||||
printf("\n");
|
||||
}
|
||||
else {
|
||||
// 该函数尚未注册任何实现版本
|
||||
printf(" No implementations registered.\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief SIMD版本枚举转字串 - 将版本枚举值转换为可读字符串
|
||||
* @param version SIMD函数版本枚举
|
||||
* @return 对应的版本名称字符串(C风格字符串)
|
||||
*
|
||||
* 该函数提供版本枚举到字符串的标准转换,用于:
|
||||
* - 日志输出和调试信息
|
||||
* - 用户界面显示
|
||||
* - 配置文件生成
|
||||
* - 错误消息构造
|
||||
*
|
||||
* 实现策略:
|
||||
* ============================================================================
|
||||
* 1. 使用switch-case实现完全映射
|
||||
* - 编译器可以检测遗漏的case分支
|
||||
* - 保证类型安全,避免隐式转换
|
||||
* - 性能优秀,通常编译为跳转表
|
||||
*
|
||||
* 2. 返回C字符串而非std::string
|
||||
* - 避免内存分配开销
|
||||
* - 字符串字面量存储在只读数据段
|
||||
* - 生命周期为整个程序运行期
|
||||
*
|
||||
* 3. 提供默认处理
|
||||
* - 对于未识别的枚举值返回"UNKNOWN"
|
||||
* - 提高代码健壮性,防止未定义行为
|
||||
*
|
||||
* 映射关系:
|
||||
* ============================================================================
|
||||
* | 枚举值 | 返回字符串 | 说明 |
|
||||
* |-------------------|--------------|------------------------|
|
||||
* | SCALAR | "SCALAR" | 标量实现 |
|
||||
* | SSE | "SSE" | SSE/SSE2指令集 |
|
||||
* | SSE3 | "SSE3" | SSE3/SSSE3指令集 |
|
||||
* | SSE4 | "SSE4" | SSE4.1/4.2指令集 |
|
||||
* | AVX | "AVX" | AVX指令集 |
|
||||
* | AVX2 | "AVX2" | AVX2+FMA指令集 |
|
||||
* | AVX512 | "AVX512" | AVX-512指令集 |
|
||||
* | NEON | "NEON" | ARM NEON指令集 |
|
||||
* | NEON_FP16 | "NEON_FP16" | NEON半精度浮点 |
|
||||
* | VECTOR | "VECTOR" | RISC-V向量扩展 |
|
||||
* | 其他 | "UNKNOWN" | 未知或非法值 |
|
||||
*
|
||||
* 使用示例:
|
||||
* ============================================================================
|
||||
* @code
|
||||
* // 日志输出
|
||||
* const char* name = simd_func_version_to_string(simd_func_version::AVX2);
|
||||
* logger->info("Using SIMD version: {}", name); // 输出: Using SIMD version: AVX2
|
||||
*
|
||||
* // 调试信息
|
||||
* printf("Current version: %s\n", simd_func_version_to_string(current_version));
|
||||
*
|
||||
* // 配置文件生成
|
||||
* config_file << "preferred_version=" << simd_func_version_to_string(preferred) << "\n";
|
||||
* @endcode
|
||||
*
|
||||
* 性能特性:
|
||||
* ============================================================================
|
||||
* - 时间复杂度:O(1) - 编译器通常优化为跳转表或二分查找
|
||||
* - 空间复杂度:O(1) - 字符串字面量存储在只读数据段
|
||||
* - 无内存分配:返回静态字符串,无运行时开销
|
||||
* - 线程安全:只读操作,无共享状态修改
|
||||
*
|
||||
* @note 返回的字符串为静态存储,调用者不应修改或释放
|
||||
* @note constexpr修饰符未使用是因为字符串字面量返回类型的限制
|
||||
* @see string_to_simd_func_version() 执行反向换
|
||||
*/
|
||||
const char* simd_func_version_to_string(simd_func_version version) {
|
||||
// 使用switch-case实现全映射
|
||||
// 编译器会检测是否遗漏case分支(如果使用-Wswitch警告)
|
||||
switch (version) {
|
||||
// 标量版本 - 基础实现,所有平台都支持
|
||||
case simd_func_version::SCALAR:
|
||||
return "SCALAR";
|
||||
|
||||
// x86/x64 SIMD指令集版本(发展顺序)
|
||||
case simd_func_version::SSE:
|
||||
return "SSE"; // 2003年:Pentium 4引入,128位向量
|
||||
case simd_func_version::SSE3:
|
||||
return "SSE3"; // 2006年:Core微架构增强浮点运算
|
||||
case simd_func_version::SSE4:
|
||||
return "SSE4"; // 2008年:Nehalem微架,增强整数和字符串处理
|
||||
case simd_func_version::AVX:
|
||||
return "AVX"; // 2011年:Sandy Bridge,256位向量
|
||||
case simd_func_version::AVX2:
|
||||
return "AVX2"; // 2013年:Haswell,完256位整数运算+FMA
|
||||
case simd_func_version::AVX512:
|
||||
return "AVX512"; // 2016年:Xeon Phi/Skylake-X,512位量
|
||||
|
||||
// ARM SIMD指令集版本
|
||||
case simd_func_version::NEON:
|
||||
return "NEON"; // ARMv8-A标准,128位量
|
||||
case simd_func_version::NEON_FP16:
|
||||
return "NEON_FP16"; // ARMv8.2-A,硬件半精度浮点支持
|
||||
|
||||
// RISC-V向量扩展
|
||||
case simd_func_version::VECTOR:
|
||||
return "VECTOR"; // RISC-V V扩展,可变长度量
|
||||
|
||||
// 默认情况:处理未知或非法的枚举值
|
||||
// 这提供了一层安全保障,虽然理论上不应到达这里
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
// 返回未知版本标识
|
||||
// 这种情况可能发生在:
|
||||
// 1. 枚举值被错误地强制转换
|
||||
// 2. 内存损坏导致枚举值异常
|
||||
// 3. 跨版本兼容性问题
|
||||
return "UNKNOWN";
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief 字符串转SIMD版本举 - 将字符串解析为版本枚举值
|
||||
* @param version_str 版本名称字符串(如"AVX2"、"NEON"等)
|
||||
* @return 对应的SIMD函数版枚举值
|
||||
*
|
||||
* 该函数将字符串表示的版本名称转换为枚举值,用于:
|
||||
* - 配置文件解析(读取用户指定的SIMD版本偏好)
|
||||
* - 命令行参数处理(--simd-version=AVX2)
|
||||
* - 环境变量解析(SIMD_VERSION=SSE4)
|
||||
* - 测试和调试(手动指定测试版本)
|
||||
*
|
||||
* 实现策略:
|
||||
* ============================================================================
|
||||
* 1. 线性字符串比较
|
||||
* - 按从高到低的版本顺序检查
|
||||
* - 使用std::string的相比较运算符
|
||||
* - 大小写敏感匹配
|
||||
*
|
||||
* 2. 回退到SCALAR版本
|
||||
* - 无法识别的字符串返回最安全的SCALAR版本
|
||||
* - 保证函数永远返回有效值
|
||||
* - 避免程序因无效输入而崩溃
|
||||
*
|
||||
* 3. 优化潜力
|
||||
* - 当前实现简单但足够高效(此函数不在热路径上)
|
||||
* - 可能的优化:哈希表查找、Trie树、完美哈希
|
||||
* - 未优化原因:此函数主要在启动时调用,性能影响可忽略
|
||||
*
|
||||
* 支持的字符串:
|
||||
* ============================================================================
|
||||
* | 输入字符串 | 返回枚举值 | 备注 |
|
||||
* |------------------|------------------------|---------------------|
|
||||
* | "SCALAR" | simd_func_version::SCALAR | 标量实现 |
|
||||
* | "SSE" | simd_func_version::SSE | SSE/SSE2 |
|
||||
* | "SSE3" | simd_func_version::SSE3 | SSE3/SSSE3 |
|
||||
* | "SSE4" | simd_func_version::SSE4 | SSE4.1/4.2 |
|
||||
* | "AVX" | simd_func_version::AVX | AVX指令集 |
|
||||
* | "AVX2" | simd_func_version::AVX2 | AVX2+FMA |
|
||||
* | "AVX512" | simd_func_version::AVX512 | AVX-512 |
|
||||
* | "NEON" | simd_func_version::NEON | ARM NEON |
|
||||
* | "NEON_FP16" | simd_func_version::NEON_FP16 | NEON半精度 |
|
||||
* | "VECTOR" | simd_func_version::VECTOR | RISC-V向量 |
|
||||
* | 其他任何字符串 | simd_func_version::SCALAR | 默认回退 |
|
||||
*
|
||||
* 使用示例:
|
||||
* ============================================================================
|
||||
* @code
|
||||
* // 配置文件解析
|
||||
* std::string config_version = config["simd_version"];
|
||||
* auto version = string_to_simd_func_version(config_version);
|
||||
*
|
||||
* // 命令行参数处理
|
||||
* if (argc > 1) {
|
||||
* auto preferred = string_to_simd_func_version(argv[1]);
|
||||
* // 使用preferred版本...
|
||||
* }
|
||||
*
|
||||
* // 环境变量解析
|
||||
* const char* env_version = std::getenv("SIMD_VERSION");
|
||||
* if (env_version) {
|
||||
* auto version = string_to_simd_func_version(env_version);
|
||||
* }
|
||||
*
|
||||
* // 测试中指定版本
|
||||
* auto test_version = string_to_simd_func_version("AVX2");
|
||||
* test_with_version(test_version);
|
||||
* @endcode
|
||||
*
|
||||
* 设计考虑:
|
||||
* ============================================================================
|
||||
* 1. 大小写敏感
|
||||
* - 当前实现要求精确匹配("AVX2"而非"avx2")
|
||||
* - 原因:保持一致性,避免混淆
|
||||
* - 改进:可添加大小写不敏感版本或预处理转换
|
||||
*
|
||||
* 2. 错误处理策略
|
||||
* - 采用"宽容"策略无效输入返回SCALAR而非抛异常
|
||||
* - 优点:避免程序崩溃,提供最低限度的功能
|
||||
* - 缺点:可能掩盖配置错误
|
||||
* - 建议:调用者应验证输入或记录回退日志
|
||||
*
|
||||
* 3. 性能特性
|
||||
* - 最坏情况:O(n) - n为版本数量(约10次字符串比较)
|
||||
* - 平均情况:取决于输入分布
|
||||
* - 优化可能:哈希表O(1),但增加复杂度
|
||||
* - 当前选择:简单性优先(此函数不在性能关键路径)
|
||||
*
|
||||
* @note 字符串比较是大小写敏感的
|
||||
* @note 无法识别的字符串会回退到SCALAR版本,不会抛出异常
|
||||
* @note 建议在调用后验证返回值是否符合预期
|
||||
* @see simd_func_version_to_string() 执行反向转换
|
||||
*/
|
||||
simd_func_version string_to_simd_func_version(const std::string& version_str) {
    // One row per recognised spelling. Matching is exact and case-sensitive,
    // so "avx2" or "AVX 2" do not match anything in this table.
    struct name_entry {
        const char* name;
        simd_func_version version;
    };

    static const name_entry known_versions[] = {
        // Scalar baseline
        {"SCALAR", simd_func_version::SCALAR},
        // x86/x64 instruction sets
        {"SSE", simd_func_version::SSE},
        {"SSE3", simd_func_version::SSE3},
        {"SSE4", simd_func_version::SSE4},
        {"AVX", simd_func_version::AVX},
        {"AVX2", simd_func_version::AVX2},
        {"AVX512", simd_func_version::AVX512},
        // ARM
        {"NEON", simd_func_version::NEON},
        {"NEON_FP16", simd_func_version::NEON_FP16},
        // RISC-V vector extension
        {"VECTOR", simd_func_version::VECTOR},
    };

    for (const auto& entry : known_versions) {
        if (version_str == entry.name) {
            return entry.version;
        }
    }

    // Unknown, misspelled or empty input silently falls back to SCALAR.
    // This can hide configuration mistakes; callers that care should log or
    // validate the input themselves.
    return simd_func_version::SCALAR;
}
|
||||
@@ -1,593 +0,0 @@
|
||||
/**
|
||||
* @file simd_func_dispatcher.h
|
||||
* @brief SIMD函数调度器 - 运行时选择最优SIMD实现的核心框架
|
||||
*
|
||||
* 本模块提供了一个强大而灵活的SIMD函数调度系统,能够:
|
||||
* - 在运行时根据CPU特性自动选择最优的SIMD实现
|
||||
* - 支持多版本函数(标量、SSE、AVX、NEON等)的统一管理
|
||||
* - 提供类型安全的函数注册和调用机制
|
||||
* - 自动回退到兼容性更好的实现版本
|
||||
*
|
||||
* 核心设计思想:
|
||||
* 1. 多版本实现:每个函数可以有多个针对不同SIMD级别的优化版本
|
||||
* 2. 运行时选择:程序启动时检测CPU特性,选择最佳版本
|
||||
* 3. 透明调用:用户调用时无需关心具体使用哪个版本
|
||||
* 4. 类型安全:使用模板和std::function确保类型匹配
|
||||
*
|
||||
* 工作流程:
|
||||
* ```
|
||||
* [注册阶段]
|
||||
* 1. 为每个函数注册多个SIMD版本的实现
|
||||
* 2. 调度器存储所有版本并根据CPU能力选择最优版本
|
||||
*
|
||||
* [调用阶段]
|
||||
* 3. 用户调用函数时,调度器自动使用预选的最优版本
|
||||
* 4. 如果最优版本不可用,自动回退到次优版本
|
||||
* ```
|
||||
*
|
||||
* 使用示例:
|
||||
* @code
|
||||
* // 注册函数的多个版本
|
||||
* REGISTER_SIMD_FUNCTION("process_audio", simd_func_version::SCALAR, scalar_impl);
|
||||
* REGISTER_SIMD_FUNCTION("process_audio", simd_func_version::AVX2, avx2_impl);
|
||||
*
|
||||
* // 获取并调用最优版本
|
||||
* auto& func = GET_SIMD_FUNCTION(void(float*, size_t), "process_audio");
|
||||
* func(data, count); // 自动使用AVX2版本(如果CPU支持)
|
||||
* @endcode
|
||||
*
|
||||
* @note 这是整个SIMD优化框架的核心组件
|
||||
* @see cpu_feature_detector, multi_version_func
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <stdexcept>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "cpu_features.h"
|
||||
|
||||
/**
 * @enum simd_func_version
 * @brief Identifies which implementation variant of a function a slot holds.
 *
 * Each enumerator names one instruction-set level a function may be
 * implemented for. The numeric values are used directly as array indices
 * (see COUNT), and the fallback search in multi_version_func walks these
 * values downwards, so the declaration order is significant.
 *
 * simd_level describes what the CPU can do; simd_func_version describes what
 * an implementation was written for. simd_level_to_version() maps between
 * the two.
 *
 * @note A function does not need an implementation for every version.
 * @see simd_level, simd_level_to_version()
 */
enum class simd_func_version {
    /** Plain C++ implementation without SIMD; the universal fallback. */
    SCALAR = 0,

    /** x86 SSE/SSE2 implementation (128-bit vectors). */
    SSE = 1,

    /** x86 SSE3/SSSE3 implementation. */
    SSE3 = 2,

    /** x86 SSE4.1/SSE4.2 implementation. */
    SSE4 = 3,

    /** x86 AVX implementation (256-bit vectors). */
    AVX = 4,

    /** x86 AVX2 (+ FMA) implementation. */
    AVX2 = 5,

    /** x86 AVX-512 implementation (512-bit vectors). */
    AVX512 = 6,

    /** ARM NEON implementation (128-bit vectors). */
    NEON = 7,

    /** ARM NEON implementation using half-precision (FP16) arithmetic. */
    NEON_FP16 = 8,

    /** RISC-V vector extension implementation. */
    VECTOR = 9,

    /** Number of versions; used only to size arrays, never a real version. */
    COUNT = 10
};
|
||||
|
||||
/**
|
||||
* @brief 将SIMD级别转换为函数版本
|
||||
* @param level CPU的SIMD级别
|
||||
* @return 对应的函数版本枚举值
|
||||
*
|
||||
* 将cpu_feature_detector检测到的SIMD级别转换为
|
||||
* 函数调度器使用的版本标识。
|
||||
*
|
||||
* 映射关系:
|
||||
* - simd_level::NONE -> simd_func_version::SCALAR
|
||||
* - simd_level::SSE -> simd_func_version::SSE
|
||||
* - simd_level::AVX2 -> simd_func_version::AVX2
|
||||
* - 等等...
|
||||
*
|
||||
* @note constexpr函数,编译时求值,零运行时开销
|
||||
* @see simd_level, simd_func_version
|
||||
*/
|
||||
constexpr auto simd_level_to_version(simd_level level) {
|
||||
switch (level) {
|
||||
case simd_level::NONE:
|
||||
return simd_func_version::SCALAR;
|
||||
case simd_level::SSE:
|
||||
return simd_func_version::SSE;
|
||||
case simd_level::SSE3:
|
||||
return simd_func_version::SSE3;
|
||||
case simd_level::SSE4:
|
||||
return simd_func_version::SSE4;
|
||||
case simd_level::AVX:
|
||||
return simd_func_version::AVX;
|
||||
case simd_level::AVX2:
|
||||
return simd_func_version::AVX2;
|
||||
case simd_level::AVX512:
|
||||
return simd_func_version::AVX512;
|
||||
case simd_level::NEON:
|
||||
return simd_func_version::NEON;
|
||||
case simd_level::NEON_FP16:
|
||||
return simd_func_version::NEON_FP16;
|
||||
}
|
||||
|
||||
// 默认回退到标量版本
|
||||
return simd_func_version::SCALAR;
|
||||
}
|
||||
|
||||
// 前向声明
|
||||
template <typename func_signature>
|
||||
class multi_version_func;
|
||||
|
||||
/**
|
||||
* @class multi_version_func
|
||||
* @brief 多版本函数容器 - 管理同一函数的多个SIMD优化版本
|
||||
* @tparam return_type 函数返回类型
|
||||
* @tparam args 函数参数类型列表
|
||||
*
|
||||
* 该类模板存储和管理一个函数的所有SIMD版本实现,
|
||||
* 并能根据CPU能力自动选择最优版本。
|
||||
*
|
||||
* 核心功能:
|
||||
* 1. 存储多个版本:为每个SIMD级别存储一个函数实现
|
||||
* 2. 自动选择:根据CPU能力选择最优可用版本
|
||||
* 3. 智能回退:如果最优版本不可用,自动使用次优版本
|
||||
* 4. 类型安全:使用std::function确保所有版本签名一致
|
||||
*
|
||||
* 选择策略:
|
||||
* - 获取推荐的SIMD级别(考虑性能和兼容性)
|
||||
* - 从推荐级别开始,向下查找第一个可用的实现
|
||||
* - 如果都不可用,抛出异常
|
||||
*
|
||||
* 使用示例:
|
||||
* @code
|
||||
* multi_version_func<void(float*, size_t)> func;
|
||||
* func.register_version(simd_func_version::SCALAR, scalar_impl);
|
||||
* func.register_version(simd_func_version::AVX2, avx2_impl);
|
||||
*
|
||||
* // 自动选择最优版本并调用
|
||||
* func(data, count);
|
||||
* @endcode
|
||||
*
|
||||
* @note 通常不直接使用,而是通过simd_func_dispatcher管理
|
||||
* @see simd_func_dispatcher
|
||||
*/
|
||||
template <typename return_type, typename... args>
|
||||
class multi_version_func<return_type(args...)> {
|
||||
public:
|
||||
/// 函数类型定义
|
||||
using func_type = std::function<return_type(args...)>;
|
||||
/// 函数数组类型(存储所有版本)
|
||||
using func_arr = std::array<func_type, static_cast<size_t>(simd_func_version::COUNT)>;
|
||||
|
||||
/**
|
||||
* @brief 默认构造函数
|
||||
*
|
||||
* 创建空的多版本函数容器,所有版本槽位初始化为nullptr
|
||||
*/
|
||||
multi_version_func() = default;
|
||||
|
||||
/**
|
||||
* @brief 注册函数的特定版本实现
|
||||
* @param version SIMD版本标识
|
||||
* @param func 该版本的函数实现
|
||||
*
|
||||
* 将一个函数实现注册到指定的SIMD版本槽位。
|
||||
* 如果该槽位已有实现,会被覆盖。
|
||||
* 注册后会自动更新最佳函数选择。
|
||||
*
|
||||
* @note 使用std::move避免不必要的拷贝
|
||||
*/
|
||||
void register_version(simd_func_version version, func_type func) {
|
||||
functions_[static_cast<size_t>(version)] = std::move(func);
|
||||
best_func_ = get_best_func(); // 更新最佳函数
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief 获取当前最佳的函数实现
|
||||
* @return 最优函数的引用
|
||||
*
|
||||
* 根据以下策略选择最佳函数:
|
||||
* 1. 获取推荐的SIMD级别(考虑CPU特性和性能)
|
||||
* 2. 转换为函数版本枚举
|
||||
* 3. 从推荐版本开始向下查找第一个可用的实现
|
||||
* 4. 如果都不可用,返回空函数指针
|
||||
*
|
||||
* 回退顺序示例(假设推荐AVX2):
|
||||
* AVX2 -> AVX -> SSE4 -> SSE3 -> SSE -> SCALAR
|
||||
*
|
||||
* @note 该函数会被缓存到best_func_成员,避免重复查找
|
||||
*/
|
||||
const auto& get_best_func() const {
|
||||
const auto recommended_level = get_recommended_simd_level();
|
||||
const auto referred_version = simd_level_to_version(recommended_level);
|
||||
|
||||
// 从首选版本开始,向下查找可用的实现
|
||||
for (int v = static_cast<int>(referred_version); v >= 0; --v) {
|
||||
auto version = static_cast<simd_func_version>(v);
|
||||
if (const auto& func = functions_[static_cast<size_t>(version)]) { return func; }
|
||||
}
|
||||
|
||||
// 如果没有找到任何实现,返回一个空函数
|
||||
static const func_type empty_func = nullptr;
|
||||
return empty_func;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief 函数调用运算符 - 执行最优版本的函数
|
||||
* @param in_args 转发给函数的参数
|
||||
* @return 函数执行结果
|
||||
* @throws std::runtime_error 如果没有可用的实现
|
||||
*
|
||||
* 自动选择并调用最佳版本的函数实现。
|
||||
* 参数会被完美转发到实际的函数。
|
||||
*
|
||||
* @note 这使得multi_version_func对象可以像普通函数一样调用
|
||||
*/
|
||||
auto operator()(args... in_args) const {
|
||||
if (!best_func_) {
|
||||
throw std::runtime_error("没有可用的SIMD实现。");
|
||||
}
|
||||
return best_func_(std::forward<args>(in_args)...);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief 检查是否有任何版本的实现
|
||||
* @return true表示至少有一个版本已注册
|
||||
*
|
||||
* 用于验证函数是否已正确注册。
|
||||
*/
|
||||
auto has_implementation() const {
|
||||
return std::any_of(functions_.begin(), functions_.end(), [](const auto& func) { return func != nullptr; });
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief 获取所有已注册版本的列表
|
||||
* @return 已注册的版本枚举值向量
|
||||
*
|
||||
* 用于调试和状态查询,列出该函数有哪些版本的实现。
|
||||
*
|
||||
* 示例输出:[SCALAR, SSE, AVX2]
|
||||
*/
|
||||
auto get_available_versions() const {
|
||||
std::vector<simd_func_version> available_versions;
|
||||
for (size_t i = 0; i < functions_.size(); ++i) {
|
||||
if (functions_[i]) { available_versions.push_back(static_cast<simd_func_version>(i)); }
|
||||
}
|
||||
return available_versions;
|
||||
}
|
||||
|
||||
private:
|
||||
func_arr functions_{}; ///< 所有版本的函数数组
|
||||
func_type best_func_{nullptr}; ///< 缓存的最佳函数(性能优化)
|
||||
};
|
||||
|
||||
/**
|
||||
* @class simd_func_dispatcher
|
||||
* @brief SIMD函数调度器 - 全局函数注册和调度中心
|
||||
*
|
||||
* 这是整个SIMD优化框架的核心类,采用单例模式管理所有SIMD优化函数。
|
||||
*
|
||||
* 主要职责:
|
||||
* 1. 函数注册:接受多版本函数的注册
|
||||
* 2. 函数存储:使用类型擦除技术统一管理不同签名的函数
|
||||
* 3. 函数查询:根据名称和签名获取最优版本的函数
|
||||
* 4. 函数调用:提供便捷的调用接口
|
||||
* 5. 状态查询:列出所有已注册的函数及其版本
|
||||
*
|
||||
* 设计特点:
|
||||
* - 单例模式:全局唯一实例,集中管理
|
||||
* - 类型安全:模板确保函数签名匹配
|
||||
* - 类型擦除:不同签名的函数可以存储在同一容器中
|
||||
* - 延迟绑定:运行时根据CPU特性选择最优版本
|
||||
*
|
||||
* 典型工作流程:
|
||||
* ```
|
||||
* [初始化阶段]
|
||||
* 1. 程序启动时,各模块注册自己的SIMD函数
|
||||
* REGISTER_SIMD_FUNCTION("mix_audio", SCALAR, scalar_mix);
|
||||
* REGISTER_SIMD_FUNCTION("mix_audio", AVX2, avx2_mix);
|
||||
*
|
||||
* [运行阶段]
|
||||
* 2. 代码中获取并调用函数
|
||||
* auto& mix = GET_SIMD_FUNCTION(void(float*, float*, float*, size_t), "mix_audio");
|
||||
* mix(src1, src2, dst, count); // 自动使用AVX2版本
|
||||
* ```
|
||||
*
|
||||
* @note 通常通过宏REGISTER_SIMD_FUNCTION、GET_SIMD_FUNCTION使用
|
||||
* @see multi_version_func, lazy_singleton
|
||||
*/
|
||||
class simd_func_dispatcher : public lazy_singleton<simd_func_dispatcher> {
|
||||
public:
|
||||
friend class lazy_singleton<simd_func_dispatcher>;
|
||||
|
||||
/**
|
||||
* @brief 注册函数的特定版本实现
|
||||
* @tparam func_signature 函数签名类型(如void(float*, size_t))
|
||||
* @param func_name 函数名称(字符串标识)
|
||||
* @param version SIMD版本标识
|
||||
* @param func 该版本的函数实现
|
||||
*
|
||||
* 将一个函数的特定SIMD版本注册到调度器。
|
||||
* 如果该函数名第一次出现,会自动创建多版本函数容器。
|
||||
* 如果该版本已存在,会被新实现覆盖。
|
||||
*
|
||||
* @note 推荐使用REGISTER_SIMD_FUNCTION宏而不是直接调用
|
||||
*/
|
||||
template <typename func_signature>
|
||||
void register_function(const std::string& func_name,
|
||||
simd_func_version version,
|
||||
std::function<func_signature> func) {
|
||||
auto& holder = get_or_create_func<func_signature>(func_name);
|
||||
holder.register_version(version, std::move(func));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief 获取函数的最优版本
|
||||
* @tparam func_signature 函数签名类型
|
||||
* @param func_name 函数名称
|
||||
* @return 多版本函数对象的引用
|
||||
* @throws std::runtime_error 如果函数未注册
|
||||
*
|
||||
* @note 推荐使用GET_SIMD_FUNCTION宏
|
||||
*/
|
||||
template <typename func_signature>
|
||||
const auto& get_function(const std::string& func_name) const {
|
||||
const auto& it = func_registry_.find(func_name);
|
||||
if (it == func_registry_.end()) {
|
||||
throw std::runtime_error("函数 '" + func_name + "' 未注册");
|
||||
}
|
||||
|
||||
auto* holder = static_cast<func_holder<func_signature>*>(it->second.get());
|
||||
return holder->func;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief 直接调用函数(便捷接口)
|
||||
* @tparam func_signature 函数签名类型
|
||||
* @tparam args 参数类型包
|
||||
* @param func_name 函数名称
|
||||
* @param in_args 转发给函数的参数
|
||||
* @return 函数执行结果
|
||||
* @throws std::runtime_error 如果函数未注册或无可用实现
|
||||
*
|
||||
* @note 推荐使用CALL_SIMD_FUNCTION宏
|
||||
*/
|
||||
template <typename func_signature, typename... args>
|
||||
auto call_function(const std::string& func_name, args&&... in_args) const {
|
||||
const auto& func = get_function<func_signature>(func_name);
|
||||
return func(std::forward<args>(in_args)...);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief 列出所有已注册的函数名称
|
||||
* @return 函数名称列表
|
||||
*/
|
||||
[[nodiscard]] auto list_functions() const -> std::vector<std::string> {
|
||||
std::vector<std::string> func_names;
|
||||
for (const auto& pair : func_registry_) { func_names.push_back(pair.first); }
|
||||
return func_names;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief 打印所有函数的注册状态
|
||||
* @see simd_func_dispatcher.cpp 实现在cpp文件中
|
||||
*/
|
||||
void print_registry_status() const;
|
||||
|
||||
private:
|
||||
/** 函数持有者基类 - 类型擦除的基础 */
|
||||
struct func_holder_base {
|
||||
virtual ~func_holder_base() = default;
|
||||
[[nodiscard]] virtual auto get_available_versions() const -> std::vector<simd_func_version> = 0;
|
||||
[[nodiscard]] virtual auto has_implementation() const -> bool = 0;
|
||||
};
|
||||
|
||||
/** 具体的函数持有者模板 */
|
||||
template <typename func_signature>
|
||||
struct func_holder : func_holder_base {
|
||||
multi_version_func<func_signature> func;
|
||||
[[nodiscard]] auto get_available_versions() const -> std::vector<simd_func_version> override {
|
||||
return func.get_available_versions();
|
||||
}
|
||||
[[nodiscard]] auto has_implementation() const -> bool override { return func.has_implementation(); }
|
||||
};
|
||||
|
||||
/** 获取或创建函数持有者(内部辅助函数) */
|
||||
template <typename func_signature>
|
||||
auto& get_or_create_func(const std::string& func_name) {
|
||||
const auto& it = func_registry_.find(func_name);
|
||||
if (it != func_registry_.end()) {
|
||||
auto* holder = static_cast<func_holder<func_signature>*>(it->second.get());
|
||||
return holder->func;
|
||||
}
|
||||
auto holder = std::make_unique<func_holder<func_signature>>();
|
||||
auto* ptr = holder.get();
|
||||
func_registry_[func_name] = std::move(holder);
|
||||
return ptr->func;
|
||||
}
|
||||
|
||||
/// 函数注册表:函数名 -> 函数持有者的映射
|
||||
std::unordered_map<std::string, std::unique_ptr<func_holder_base>> func_registry_{};
|
||||
};
|
||||
|
||||
/**
 * @def REGISTER_SIMD_FUNCTION
 * @brief Registers one implementation of a named function with the global
 *        simd_func_dispatcher, deducing the signature from @p func.
 * @param func_name Function name (registry key).
 * @param version   simd_func_version slot to fill.
 * @param func      Function pointer or other callable accepted by the
 *                  std::function deduction guides.
 *
 * The macro expands to a single expression without a trailing semicolon, so
 * it behaves like an ordinary call inside if/else and similar constructs;
 * terminate the statement yourself.
 *
 * @code
 * REGISTER_SIMD_FUNCTION("process", simd_func_version::SCALAR, scalar_process);
 * REGISTER_SIMD_FUNCTION("process", simd_func_version::AVX2, avx2_process);
 * @endcode
 */
#define REGISTER_SIMD_FUNCTION(func_name, version, func) \
    simd_func_dispatcher::instance().register_function((func_name), (version), std::function(func))
|
||||
|
||||
/**
 * @def GET_SIMD_FUNCTION
 * @brief Fetches the multi-version function object registered under a name.
 * @param func_signature Signature the function was registered with.
 * @param func_name      Function name (registry key).
 * @return Reference to the multi-version function object.
 *
 * The macro expands to a single expression without a trailing semicolon, so
 * it can be used as an initializer or inside a larger expression.
 *
 * @code
 * auto& process = GET_SIMD_FUNCTION(void(float*, size_t), "process");
 * process(data, count);
 * @endcode
 */
#define GET_SIMD_FUNCTION(func_signature, func_name) \
    simd_func_dispatcher::instance().get_function<func_signature>((func_name))
|
||||
|
||||
/**
 * @def CALL_SIMD_FUNCTION
 * @brief Looks up a registered function by name and calls it.
 * @param func_signature Signature the function was registered with.
 * @param func_name      Function name (registry key).
 * @param ...            Arguments passed to the function (at least one).
 * @return The function's result.
 *
 * The macro expands to a single expression without a trailing semicolon, so
 * the result can be used directly, e.g.
 * `float rms = CALL_SIMD_FUNCTION(float(const float*, size_t), "rms", data, n);`.
 *
 * @code
 * CALL_SIMD_FUNCTION(void(float*, size_t), "process", data, count);
 * @endcode
 */
#define CALL_SIMD_FUNCTION(func_signature, func_name, ...) \
    simd_func_dispatcher::instance().call_function<func_signature>((func_name), __VA_ARGS__)
|
||||
|
||||
/**
|
||||
* @class simd_auto_register
|
||||
* @brief SIMD自动注册助手 - 利用静态初始化自动注册函数
|
||||
* @tparam func_signature 函数签名类型
|
||||
*
|
||||
* 该类利用C++的静态初始化机制,在程序启动时自动注册函数。
|
||||
* 通常不直接使用,而是通过AUTO_REGISTER_SIMD_FUNCTION宏。
|
||||
*
|
||||
* @see AUTO_REGISTER_SIMD_FUNCTION
|
||||
*/
|
||||
template <typename func_signature>
|
||||
class simd_auto_register {
|
||||
public:
|
||||
simd_auto_register(const std::string& func_name, simd_func_version version, std::function<func_signature> func) {
|
||||
simd_func_dispatcher::instance().register_function<func_signature>(func_name, version, std::move(func));
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief 将SIMD函数版本枚举转换为字符串
|
||||
* @param version 函数版本枚举值
|
||||
* @return 版本名称字符串
|
||||
*
|
||||
* 用于调试输出和日志记录。
|
||||
*
|
||||
* 示例:
|
||||
* @code
|
||||
* const char* name = simd_func_version_to_string(simd_func_version::AVX2);
|
||||
* // name = "AVX2"
|
||||
* @endcode
|
||||
*
|
||||
* @see simd_func_dispatcher.cpp
|
||||
*/
|
||||
const char* simd_func_version_to_string(simd_func_version version);
|
||||
|
||||
/**
|
||||
* @brief 将字符串转换为SIMD函数版本枚举
|
||||
* @param version_str 版本名称字符串
|
||||
* @return 对应的函数版本枚举值
|
||||
*
|
||||
* 字符串不匹配时返回simd_func_version::SCALAR。
|
||||
*
|
||||
* 示例:
|
||||
* @code
|
||||
* auto version = string_to_simd_func_version("AVX2");
|
||||
* // version = simd_func_version::AVX2
|
||||
* @endcode
|
||||
*
|
||||
* @see simd_func_dispatcher.cpp
|
||||
*/
|
||||
simd_func_version string_to_simd_func_version(const std::string& version_str);
|
||||
4
src/simd/simd_interface/CMakeLists.txt
Normal file
4
src/simd/simd_interface/CMakeLists.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
# Interface target that carries the shared SIMD C API headers.
# simple_library() is a project helper defined elsewhere; presumably it
# creates the ${PROJECT_NAME} target - confirm against its definition.
project(alicho_simd_interface)

simple_library(INTERFACE)

# NOTE(review): as an INTERFACE definition, SIMD_INTERFACE_EXPORTS propagates
# to every target that links this one, so consumers also see the "exporting"
# branch of simd_export.h. Confirm this is intended.
target_compile_definitions(
  ${PROJECT_NAME}
  INTERFACE
    SIMD_INTERFACE_EXPORTS
)
|
||||
@@ -29,6 +29,7 @@
|
||||
#include <new>
|
||||
#include <cstddef>
|
||||
#include <stdexcept> // ASSERT_ALIGNED宏需要
|
||||
#include <numeric>
|
||||
|
||||
/**
|
||||
* @defgroup alignment_constants 对齐常量定义
|
||||
@@ -385,29 +386,23 @@ bool operator!=(const aligned_allocator<T1, A1>&, const aligned_allocator<T2, A2
|
||||
|
||||
/** SSE对齐分配器 (16字节对齐)
|
||||
* 适用于SSE/SSE2指令优化的代码
|
||||
* @tparam T 元素类型
|
||||
*/
|
||||
template <typename T>
|
||||
using sse_aligned_allocator = aligned_allocator<T, ALIGNMENT_SSE>;
|
||||
using sse_aligned_allocator = aligned_allocator<float, ALIGNMENT_SSE>;
|
||||
|
||||
/** AVX对齐分配器 (32字节对齐)
|
||||
* 适用于AVX/AVX2指令优化的代码
|
||||
* @tparam T 元素类型
|
||||
*
|
||||
* 示例:
|
||||
* @code
|
||||
* std::vector<float, avx_aligned_allocator> data(1024);
|
||||
* @endcode
|
||||
*/
|
||||
template <typename T>
|
||||
using avx_aligned_allocator = aligned_allocator<T, ALIGNMENT_AVX>;
|
||||
using avx_aligned_allocator = aligned_allocator<float, ALIGNMENT_AVX>;
|
||||
|
||||
/** AVX-512对齐分配器 (64字节对齐)
|
||||
* 适用于AVX-512指令优化的代码
|
||||
* @tparam T 元素类型
|
||||
*/
|
||||
template <typename T>
|
||||
using avx512_aligned_allocator = aligned_allocator<T, ALIGNMENT_AVX512>;
|
||||
using avx512_aligned_allocator = aligned_allocator<float, ALIGNMENT_AVX512>;
|
||||
|
||||
/** 缓存行对齐分配器 (64字节对齐)
|
||||
* 用于避免false sharing,优化多线程性能
|
||||
17
src/simd/simd_interface/simd_export.h
Normal file
17
src/simd/simd_interface/simd_export.h
Normal file
@@ -0,0 +1,17 @@
|
||||
#pragma once
|
||||
|
||||
// SIMD_EXPORT marks the C entry points of the SIMD libraries for export/import.
//
// The Windows branch is selected by target platform (_WIN32 / __CYGWIN__)
// rather than by compiler (_MSC_VER), so MinGW and Cygwin builds also use
// __declspec instead of the ELF visibility attribute, which those targets
// do not support.
//
// SIMD_INTERFACE_EXPORTS is expected to be defined while building the
// libraries that implement the interface.
#if defined(_WIN32) || defined(__CYGWIN__)
    #ifdef SIMD_INTERFACE_EXPORTS
        #define SIMD_EXPORT __declspec(dllexport)
    #else
        #define SIMD_EXPORT __declspec(dllimport)
    #endif
#elif defined(__GNUC__)
    #ifdef SIMD_INTERFACE_EXPORTS
        #define SIMD_EXPORT __attribute__((visibility("default")))
    #else
        #define SIMD_EXPORT
    #endif
#else
    #define SIMD_EXPORT
#endif
|
||||
1
src/simd/simd_interface/simd_interface.cpp
Normal file
1
src/simd/simd_interface/simd_interface.cpp
Normal file
@@ -0,0 +1 @@
|
||||
#include "simd_interface.h"
|
||||
17
src/simd/simd_interface/simd_interface.h
Normal file
17
src/simd/simd_interface/simd_interface.h
Normal file
@@ -0,0 +1,17 @@
|
||||
#pragma once
|
||||
|
||||
#include "simd_export.h"
|
||||
#include <cstddef>
|
||||
|
||||
extern "C" {
|
||||
SIMD_EXPORT void fill_buffer(float* buffer, float value, size_t num_samples);
|
||||
SIMD_EXPORT void mix_audio(const float* src1, const float* src2, float* dst, size_t num_samples);
|
||||
SIMD_EXPORT void apply_gain(const float* src, float* dst, float gain, size_t num_samples);
|
||||
SIMD_EXPORT float calculate_rms(const float* src, size_t num_samples);
|
||||
SIMD_EXPORT float calculate_peak(const float* src, size_t num_samples);
|
||||
SIMD_EXPORT void normalize_audio(const float* src, float* dst, float target_peak, size_t num_samples);
|
||||
SIMD_EXPORT void stereo_to_mono(const float* stereo_src, float* mono_dst, size_t num_stereo_samples);
|
||||
SIMD_EXPORT void limit_audio(const float* src, float* dst, float threshold, float* limiter_state, float sample_rate, size_t num_samples);
|
||||
SIMD_EXPORT void fade_audio(const float* src, float* dst, size_t fade_in_samples, size_t fade_out_samples, size_t num_samples);
|
||||
SIMD_EXPORT void simple_eq(const float* src, float* dst, float low_gain, float mid_gain, float high_gain, float* eq_state, size_t num_samples);
|
||||
}
|
||||
4
src/simd/simd_scaler/CMakeLists.txt
Normal file
4
src/simd/simd_scaler/CMakeLists.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
# Scalar (non-SIMD) implementation of the SIMD C interface.
# simple_library() is a project helper defined elsewhere; presumably it
# creates the ${PROJECT_NAME} target as a shared library - confirm.
project(alicho_simd_scaler)

simple_library(SHARED)

target_link_libraries(
  ${PROJECT_NAME}
  PUBLIC
    alicho_simd_interface
)
|
||||
179
src/simd/simd_scaler/simd_func.cpp
Normal file
179
src/simd/simd_scaler/simd_func.cpp
Normal file
@@ -0,0 +1,179 @@
|
||||
#include "simd_interface.h"
|
||||
|
||||
#include <cmath>
|
||||
|
||||
extern "C"
|
||||
{
|
||||
// Scalar reference: writes `value` to buffer[0 .. num_samples).
SIMD_EXPORT void fill_buffer(float *buffer, float value, size_t num_samples)
{
    float *cursor = buffer;
    float *const stop = buffer + num_samples;
    while (cursor != stop)
    {
        *cursor = value;
        ++cursor;
    }
}
|
||||
|
||||
// Scalar reference: dst[i] = src1[i] + src2[i], processed in ascending order.
SIMD_EXPORT void mix_audio(const float *src1, const float *src2, float *dst, size_t num_samples)
{
    size_t pos = 0;
    while (pos < num_samples)
    {
        const float lhs = src1[pos];
        const float rhs = src2[pos];
        dst[pos] = lhs + rhs;
        ++pos;
    }
}
|
||||
|
||||
// Scalar reference: dst[i] = src[i] * gain, processed in ascending order.
SIMD_EXPORT void apply_gain(const float *src, float *dst, float gain, size_t num_samples)
{
    size_t pos = 0;
    while (pos < num_samples)
    {
        const float scaled = src[pos] * gain;
        dst[pos] = scaled;
        ++pos;
    }
}
|
||||
|
||||
// Scalar reference: root mean square of src[0 .. num_samples).
//
// Returns 0 for an empty buffer (the division would otherwise be 0/0 = NaN).
// The sum of squares is accumulated in double so that long buffers do not
// lose precision to float rounding.
SIMD_EXPORT float calculate_rms(const float *src, size_t num_samples)
{
    if (num_samples == 0)
    {
        return 0.0f;
    }

    double sum_squares = 0.0;
    for (size_t i = 0; i < num_samples; ++i)
    {
        const double sample = static_cast<double>(src[i]);
        sum_squares += sample * sample;
    }
    return static_cast<float>(std::sqrt(sum_squares / static_cast<double>(num_samples)));
}
|
||||
|
||||
// Scalar reference: largest absolute value in src[0 .. num_samples);
// 0 for an empty buffer.
SIMD_EXPORT float calculate_peak(const float *src, size_t num_samples)
{
    float loudest = 0.0f;
    const float *const stop = src + num_samples;
    for (const float *cursor = src; cursor != stop; ++cursor)
    {
        const float magnitude = std::fabs(*cursor);
        // Strict '>' keeps the current peak when the comparison is false.
        loudest = (magnitude > loudest) ? magnitude : loudest;
    }
    return loudest;
}
|
||||
|
||||
SIMD_EXPORT void normalize_audio(const float *src, float *dst, float target_peak, size_t num_samples)
|
||||
{
|
||||
if (num_samples == 0 || target_peak <= 0.0f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
const float current_peak = calculate_peak(src, num_samples);
|
||||
|
||||
if (current_peak < 1e-10f)
|
||||
{
|
||||
for (size_t i = 0; i < num_samples; ++i)
|
||||
{
|
||||
dst[i] = 0.0f;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
const float gain_factor = target_peak / current_peak;
|
||||
apply_gain(src, dst, gain_factor, num_samples);
|
||||
}
|
||||
|
||||
// Scalar reference: averages interleaved sample pairs into mono.
//
// num_stereo_samples counts individual floats in stereo_src, so
// num_stereo_samples / 2 mono samples are written. A trailing unpaired sample
// (odd count) is ignored; previously it was paired with the element one past
// the end of the input.
SIMD_EXPORT void stereo_to_mono(const float *stereo_src, float *mono_dst, size_t num_stereo_samples)
{
    const size_t num_frames = num_stereo_samples / 2;
    for (size_t frame = 0; frame < num_frames; ++frame)
    {
        const size_t first = frame * 2;
        mono_dst[frame] = (stereo_src[first] + stereo_src[first + 1]) * 0.5f;
    }
}
|
||||
|
||||
// Scalar reference limiter with instant attack and exponential release.
//
// For each sample the target gain is threshold/|sample| when the sample
// exceeds the threshold and 1 otherwise. The running gain drops to the target
// immediately and recovers towards it with a 50 ms time constant.
//
// limiter_state (optional) carries the running gain between calls: it is read
// on entry and written back on exit. Previously it was only read, so the gain
// did not carry over from one block to the next.
// Does nothing when the buffer is empty or threshold is not positive.
SIMD_EXPORT void limit_audio(const float *src, float *dst, float threshold, float *limiter_state, float sample_rate,
                             size_t num_samples)
{
    if (num_samples == 0 || threshold <= 0.0f)
    {
        return;
    }

    constexpr float release_time = 0.05f;
    // A non-positive sample rate has no meaningful time constant; a
    // coefficient of 0 means "release instantly" and keeps the gain bounded.
    const float release_coeff = sample_rate > 0.0f ? std::exp(-1.0f / (release_time * sample_rate)) : 0.0f;
    float current_gain = limiter_state != nullptr ? *limiter_state : 1.0f;

    for (size_t i = 0; i < num_samples; ++i)
    {
        const float sample = src[i];
        const float abs_sample = std::fabs(sample);

        const float target_gain = abs_sample > threshold ? threshold / abs_sample : 1.0f;

        if (target_gain < current_gain)
        {
            // Attack: clamp immediately.
            current_gain = target_gain;
        }
        else
        {
            // Release: move exponentially back towards the target gain.
            current_gain = target_gain + (current_gain - target_gain) * release_coeff;
        }

        dst[i] = sample * current_gain;
    }

    if (limiter_state != nullptr)
    {
        *limiter_state = current_gain;
    }
}
|
||||
|
||||
// Scalar reference: linear fade-in over the first fade_in_samples samples and
// linear fade-out over the last fade_out_samples samples.
//
// Every output sample is written, so samples outside both fade regions are
// copied unchanged (previously they were skipped, leaving dst unwritten for
// out-of-place use). Where the two regions overlap both gains are applied,
// which matches what in-place processing already produced.
SIMD_EXPORT void fade_audio(const float *src, float *dst, size_t fade_in_samples, size_t fade_out_samples,
                            size_t num_samples)
{
    if (num_samples == 0)
    {
        return;
    }

    // Fade-in covers [0, fade_in_end); fade-out covers [fade_out_start, num_samples).
    const size_t fade_in_end = fade_in_samples < num_samples ? fade_in_samples : num_samples;
    const size_t fade_out_start = num_samples > fade_out_samples ? num_samples - fade_out_samples : 0;

    const float fade_in_step = fade_in_samples > 0 ? 1.0f / static_cast<float>(fade_in_samples) : 0.0f;
    const float fade_out_step = fade_out_samples > 0 ? 1.0f / static_cast<float>(fade_out_samples) : 0.0f;

    for (size_t i = 0; i < num_samples; ++i)
    {
        float value = src[i];

        if (i < fade_in_end)
        {
            value *= static_cast<float>(i) * fade_in_step;
        }
        if (fade_out_samples > 0 && i >= fade_out_start)
        {
            value *= 1.0f - static_cast<float>(i - fade_out_start) * fade_out_step;
        }

        dst[i] = value;
    }
}
|
||||
|
||||
// Scalar reference three-band gain stage built from two one-pole filters.
//
// The signal is split into a low band (low-pass state), a high band
// (high-pass state) and the remainder (mid), and the bands are recombined
// with the given gains. The filter coefficients are derived from fixed
// 200 Hz / 2000 Hz cut-offs and a fixed 44100 Hz sample rate.
//
// eq_state (optional): element [0] is the low-pass state and [1] the
// high-pass state; both are read on entry and written back on exit.
//
// The previous input sample is kept in a local variable rather than re-read
// from src[i - 1], so in-place processing (dst == src) gives the same result
// as out-of-place processing.
// NOTE(review): the previous input is not part of eq_state, so the high band
// restarts from 0 at the start of every call - confirm whether eq_state can
// be extended.
SIMD_EXPORT void simple_eq(const float *src, float *dst, float low_gain, float mid_gain, float high_gain, float *eq_state, size_t num_samples)
{
    if (num_samples == 0)
    {
        return;
    }

    float low_pass_state = eq_state != nullptr ? eq_state[0] : 0.0f;
    float high_pass_state = eq_state != nullptr ? eq_state[1] : 0.0f;

    const float low_cutoff_freq = 200.0f;
    const float high_cutoff_freq = 2000.0f;
    const float sample_rate = 44100.0f;
    const float low_alpha = low_cutoff_freq / (low_cutoff_freq + sample_rate);
    const float high_alpha = sample_rate / (high_cutoff_freq + sample_rate);

    float previous_input = 0.0f;

    for (size_t i = 0; i < num_samples; ++i)
    {
        const float sample = src[i];

        low_pass_state += low_alpha * (sample - low_pass_state);
        const float low_freq = low_pass_state;

        high_pass_state = high_alpha * (high_pass_state + sample - previous_input);
        const float high_freq = high_pass_state;

        // Capture the input before dst[i] is written; dst may alias src.
        previous_input = sample;

        const float mid_freq = sample - low_freq - high_freq;

        dst[i] = low_freq * low_gain + mid_freq * mid_gain + high_freq * high_gain;
    }

    if (eq_state != nullptr)
    {
        eq_state[0] = low_pass_state;
        eq_state[1] = high_pass_state;
    }
}
|
||||
} // extern "C"
|
||||
7
src/simd/simd_sse/CMakeLists.txt
Normal file
7
src/simd/simd_sse/CMakeLists.txt
Normal file
@@ -0,0 +1,7 @@
|
||||
# SSE implementation of the SIMD C interface; only built for x86-64 targets.
# simple_library() is a project helper defined elsewhere; presumably it
# creates the ${PROJECT_NAME} target as a shared library - confirm.
project(alicho_simd_sse)

if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64")
  simple_library(SHARED)

  # -msse4.2 is a GCC/Clang-style flag. MSVC does not know it and exposes the
  # SSE intrinsics on x64 without any flag, so the option is skipped there.
  target_compile_options(
    ${PROJECT_NAME}
    PRIVATE
      "$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-msse4.2>"
  )

  target_link_libraries(
    ${PROJECT_NAME}
    PUBLIC
      alicho_simd_interface
  )
endif()
|
||||
699
src/simd/simd_sse/simd_func.cpp
Normal file
699
src/simd/simd_sse/simd_func.cpp
Normal file
@@ -0,0 +1,699 @@
|
||||
/**
|
||||
* @file simd_func.cpp
|
||||
* @brief x86 SSE音频处理函数实现
|
||||
*/
|
||||
|
||||
#include "simd_interface.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <immintrin.h>
|
||||
#include "aligned_allocator.h"
|
||||
|
||||
extern "C"
|
||||
{
|
||||
// SSE version: writes `value` to buffer[0 .. num_samples).
// Uses aligned stores, so `buffer` must satisfy ALIGNMENT_SSE (checked by
// ASSERT_ALIGNED). Work is done in blocks of four vectors, then single
// vectors, then a scalar tail.
SIMD_EXPORT void fill_buffer(float *buffer, float value, size_t num_samples)
{
    ASSERT_ALIGNED(buffer, ALIGNMENT_SSE);

    constexpr size_t lanes = 4;              // floats per __m128
    constexpr size_t vectors_per_block = 4;  // manual unroll factor
    constexpr size_t block = lanes * vectors_per_block;

    const __m128 splat = _mm_set1_ps(value);
    size_t pos = 0;

    // Unrolled main loop: 16 floats per iteration.
    while (pos + block <= num_samples)
    {
        float *const base = buffer + pos;
        _mm_store_ps(base, splat);
        _mm_store_ps(base + lanes, splat);
        _mm_store_ps(base + 2 * lanes, splat);
        _mm_store_ps(base + 3 * lanes, splat);
        pos += block;
    }

    // Remaining whole vectors.
    while (pos + lanes <= num_samples)
    {
        _mm_store_ps(buffer + pos, splat);
        pos += lanes;
    }

    // Scalar tail (fewer than 4 samples left).
    while (pos < num_samples)
    {
        buffer[pos] = value;
        ++pos;
    }
}
|
||||
|
||||
// SSE version: dst[i] = src1[i] + src2[i] for i in [0, num_samples).
// Uses aligned loads/stores, so all three pointers must satisfy
// ALIGNMENT_SSE (checked by ASSERT_ALIGNED). Within each 16-float block all
// inputs are loaded before any result is stored.
SIMD_EXPORT void mix_audio(const float *src1, const float *src2, float *dst, size_t num_samples)
{
    ASSERT_ALIGNED(src1, ALIGNMENT_SSE);
    ASSERT_ALIGNED(src2, ALIGNMENT_SSE);
    ASSERT_ALIGNED(dst, ALIGNMENT_SSE);

    constexpr size_t lanes = 4;              // floats per __m128
    constexpr size_t vectors_per_block = 4;  // manual unroll factor
    constexpr size_t block = lanes * vectors_per_block;

    size_t pos = 0;

    // Unrolled main loop: 16 floats per iteration.
    while (pos + block <= num_samples)
    {
        __m128 lhs[vectors_per_block];
        __m128 rhs[vectors_per_block];

        for (size_t k = 0; k < vectors_per_block; ++k)
        {
            lhs[k] = _mm_load_ps(src1 + pos + k * lanes);
        }
        for (size_t k = 0; k < vectors_per_block; ++k)
        {
            rhs[k] = _mm_load_ps(src2 + pos + k * lanes);
        }
        for (size_t k = 0; k < vectors_per_block; ++k)
        {
            lhs[k] = _mm_add_ps(lhs[k], rhs[k]);
        }
        for (size_t k = 0; k < vectors_per_block; ++k)
        {
            _mm_store_ps(dst + pos + k * lanes, lhs[k]);
        }

        pos += block;
    }

    // Remaining whole vectors.
    while (pos + lanes <= num_samples)
    {
        const __m128 lhs = _mm_load_ps(src1 + pos);
        const __m128 rhs = _mm_load_ps(src2 + pos);
        _mm_store_ps(dst + pos, _mm_add_ps(lhs, rhs));
        pos += lanes;
    }

    // Scalar tail (fewer than 4 samples left).
    while (pos < num_samples)
    {
        dst[pos] = src1[pos] + src2[pos];
        ++pos;
    }
}
|
||||
|
||||
// SSE version: dst[i] = src[i] * gain for i in [0, num_samples).
// Uses aligned loads/stores, so both pointers must satisfy ALIGNMENT_SSE
// (checked by ASSERT_ALIGNED). Within each 16-float block all inputs are
// loaded before any result is stored.
SIMD_EXPORT void apply_gain(const float *src, float *dst, float gain, size_t num_samples)
{
    ASSERT_ALIGNED(src, ALIGNMENT_SSE);
    ASSERT_ALIGNED(dst, ALIGNMENT_SSE);

    constexpr size_t lanes = 4;              // floats per __m128
    constexpr size_t vectors_per_block = 4;  // manual unroll factor
    constexpr size_t block = lanes * vectors_per_block;

    const __m128 gain_splat = _mm_set1_ps(gain);
    size_t pos = 0;

    // Unrolled main loop: 16 floats per iteration.
    while (pos + block <= num_samples)
    {
        __m128 samples[vectors_per_block];

        for (size_t k = 0; k < vectors_per_block; ++k)
        {
            samples[k] = _mm_load_ps(src + pos + k * lanes);
        }
        for (size_t k = 0; k < vectors_per_block; ++k)
        {
            samples[k] = _mm_mul_ps(samples[k], gain_splat);
        }
        for (size_t k = 0; k < vectors_per_block; ++k)
        {
            _mm_store_ps(dst + pos + k * lanes, samples[k]);
        }

        pos += block;
    }

    // Remaining whole vectors.
    while (pos + lanes <= num_samples)
    {
        const __m128 scaled = _mm_mul_ps(_mm_load_ps(src + pos), gain_splat);
        _mm_store_ps(dst + pos, scaled);
        pos += lanes;
    }

    // Scalar tail (fewer than 4 samples left).
    while (pos < num_samples)
    {
        dst[pos] = src[pos] * gain;
        ++pos;
    }
}
|
||||
|
||||
// Returns the root-mean-square level of `src`: sqrt(sum(x^2) / num_samples).
//
//   src          input samples; checked against ALIGNMENT_SSE because the
//                vector passes use aligned loads
//   num_samples  number of floats to analyse
//
// Returns 0.0f for an empty buffer.
SIMD_EXPORT float calculate_rms(const float *src, size_t num_samples)
{
    ASSERT_ALIGNED(src, ALIGNMENT_SSE);

    // An empty buffer would otherwise evaluate sqrt(0 / 0) and return NaN.
    if (num_samples == 0)
    {
        return 0.0f;
    }

    constexpr size_t simd_width = 4;
    constexpr size_t unroll_factor = 4;
    size_t i = 0;

    // Four independent accumulators, one per unrolled vector.
    auto sum_squares0 = _mm_setzero_ps();
    auto sum_squares1 = _mm_setzero_ps();
    auto sum_squares2 = _mm_setzero_ps();
    auto sum_squares3 = _mm_setzero_ps();

    for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
    {
        const auto a0 = _mm_load_ps(&src[i]);
        const auto a1 = _mm_load_ps(&src[i + 4]);
        const auto a2 = _mm_load_ps(&src[i + 8]);
        const auto a3 = _mm_load_ps(&src[i + 12]);

        sum_squares0 = _mm_add_ps(sum_squares0, _mm_mul_ps(a0, a0));
        sum_squares1 = _mm_add_ps(sum_squares1, _mm_mul_ps(a1, a1));
        sum_squares2 = _mm_add_ps(sum_squares2, _mm_mul_ps(a2, a2));
        sum_squares3 = _mm_add_ps(sum_squares3, _mm_mul_ps(a3, a3));
    }

    auto sum_squares = _mm_add_ps(_mm_add_ps(sum_squares0, sum_squares1),
                                  _mm_add_ps(sum_squares2, sum_squares3));

    for (; i + simd_width <= num_samples; i += simd_width)
    {
        const auto a = _mm_load_ps(&src[i]);
        sum_squares = _mm_add_ps(sum_squares, _mm_mul_ps(a, a));
    }

    // Horizontal sum with shuffles only (no SSE3 _mm_hadd_ps), the same
    // reduction shape calculate_peak uses. Lane 0 ends up holding
    // (s0 + s1) + (s2 + s3), which is the pairing _mm_hadd_ps produced.
    const auto swapped_pairs = _mm_shuffle_ps(sum_squares, sum_squares, _MM_SHUFFLE(2, 3, 0, 1));
    const auto pair_sums = _mm_add_ps(sum_squares, swapped_pairs);
    const auto swapped_halves = _mm_shuffle_ps(pair_sums, pair_sums, _MM_SHUFFLE(1, 0, 3, 2));
    double total_sum = _mm_cvtss_f32(_mm_add_ps(pair_sums, swapped_halves));

    // Scalar remainder, accumulated in double.
    for (; i < num_samples; ++i)
    {
        total_sum += static_cast<double>(src[i]) * static_cast<double>(src[i]);
    }

    return static_cast<float>(std::sqrt(total_sum / static_cast<double>(num_samples)));
}
|
||||
|
||||
// Returns the largest absolute sample value found in `src`.
//
//   src          input samples; checked against ALIGNMENT_SSE because the
//                vector passes use aligned loads
//   num_samples  number of floats to scan
//
// The running maximum starts at zero, so an empty buffer yields 0.0f.
SIMD_EXPORT float calculate_peak(const float *src, size_t num_samples)
{
    ASSERT_ALIGNED(src, ALIGNMENT_SSE);

    constexpr size_t lanes = 4;
    constexpr size_t block = lanes * 4;

    // -0.0f has only the sign bit set; and-not with it clears the sign of
    // every lane, i.e. computes |x|.
    const __m128 sign_bit = _mm_set1_ps(-0.0f);

    __m128 best0 = _mm_setzero_ps();
    __m128 best1 = _mm_setzero_ps();
    __m128 best2 = _mm_setzero_ps();
    __m128 best3 = _mm_setzero_ps();

    size_t pos = 0;

    // Unrolled pass with four independent running maxima.
    while (pos + block <= num_samples)
    {
        const float *in = src + pos;

        const __m128 m0 = _mm_andnot_ps(sign_bit, _mm_load_ps(in));
        const __m128 m1 = _mm_andnot_ps(sign_bit, _mm_load_ps(in + 4));
        const __m128 m2 = _mm_andnot_ps(sign_bit, _mm_load_ps(in + 8));
        const __m128 m3 = _mm_andnot_ps(sign_bit, _mm_load_ps(in + 12));

        best0 = _mm_max_ps(best0, m0);
        best1 = _mm_max_ps(best1, m1);
        best2 = _mm_max_ps(best2, m2);
        best3 = _mm_max_ps(best3, m3);

        pos += block;
    }

    // Collapse the four running maxima into one vector.
    __m128 best = _mm_max_ps(_mm_max_ps(best0, best1), _mm_max_ps(best2, best3));

    // Single-vector pass.
    while (pos + lanes <= num_samples)
    {
        const __m128 magnitude = _mm_andnot_ps(sign_bit, _mm_load_ps(src + pos));
        best = _mm_max_ps(best, magnitude);
        pos += lanes;
    }

    // Horizontal maximum: compare neighbouring lanes, then the two halves.
    const __m128 swapped_pairs = _mm_shuffle_ps(best, best, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128 pair_max = _mm_max_ps(best, swapped_pairs);
    const __m128 swapped_halves = _mm_shuffle_ps(pair_max, pair_max, _MM_SHUFFLE(1, 0, 3, 2));
    float peak = _mm_cvtss_f32(_mm_max_ps(pair_max, swapped_halves));

    // Scalar remainder.
    while (pos < num_samples)
    {
        const float magnitude = std::fabs(src[pos]);
        peak = magnitude > peak ? magnitude : peak;
        ++pos;
    }

    return peak;
}
|
||||
|
||||
// Rescales `src` so that its absolute peak equals `target_peak` and writes
// the result to `dst`.
//
//   src          input samples; checked against ALIGNMENT_SSE
//   dst          output buffer receiving `num_samples` floats; checked
//                against ALIGNMENT_SSE
//   target_peak  desired absolute peak of the output
//   num_samples  number of floats to process
//
// Returns without touching `dst` when `num_samples` is zero or
// `target_peak <= 0`. When the measured peak is below 1e-10 the output is
// filled with zeros instead of being scaled.
SIMD_EXPORT void normalize_audio(const float *src, float *dst, float target_peak, size_t num_samples)
{
    ASSERT_ALIGNED(src, ALIGNMENT_SSE);
    ASSERT_ALIGNED(dst, ALIGNMENT_SSE);

    if (num_samples == 0)
    {
        return;
    }
    if (target_peak <= 0.0f)
    {
        return;
    }

    const float measured_peak = calculate_peak(src, num_samples);

    if (measured_peak < 1e-10f)
    {
        // Effectively silent input: a gain of target / peak would blow up,
        // so emit silence.
        constexpr size_t lanes = 4;
        const __m128 silence = _mm_setzero_ps();
        size_t pos = 0;

        // Whole vectors first ...
        while (pos + lanes <= num_samples)
        {
            _mm_store_ps(dst + pos, silence);
            pos += lanes;
        }

        // ... then the scalar remainder.
        while (pos < num_samples)
        {
            dst[pos] = 0.0f;
            ++pos;
        }
        return;
    }

    apply_gain(src, dst, target_peak / measured_peak, num_samples);
}
|
||||
|
||||
// Downmixes interleaved stereo to mono by averaging each left/right pair:
// mono[n] = (stereo[2n] + stereo[2n + 1]) * 0.5.
//
//   stereo_src          interleaved L,R,L,R,... input holding
//                       2 * num_stereo_samples floats; checked against
//                       ALIGNMENT_SSE
//   mono_dst            output buffer receiving num_stereo_samples floats;
//                       checked against ALIGNMENT_SSE
//   num_stereo_samples  number of left/right pairs
SIMD_EXPORT void stereo_to_mono(const float *stereo_src, float *mono_dst, size_t num_stereo_samples)
{
    ASSERT_ALIGNED(stereo_src, ALIGNMENT_SSE);
    ASSERT_ALIGNED(mono_dst, ALIGNMENT_SSE);

    if (num_stereo_samples == 0)
    {
        return;
    }

    constexpr size_t pairs_per_pass = 16;                  // mono samples written per pass
    constexpr size_t floats_per_pass = pairs_per_pass * 2; // interleaved floats read per pass

    const size_t total_floats = num_stereo_samples * 2;
    const __m128 half = _mm_set1_ps(0.5f);

    size_t in_pos = 0;
    size_t out_pos = 0;

    while (in_pos + floats_per_pass <= total_floats)
    {
        // Read all eight interleaved vectors before writing anything.
        __m128 interleaved[8];
        for (size_t k = 0; k < 8; ++k)
        {
            interleaved[k] = _mm_load_ps(stereo_src + in_pos + 4 * k);
        }

        __m128 averaged[4];
        for (size_t k = 0; k < 4; ++k)
        {
            const __m128 first = interleaved[2 * k];
            const __m128 second = interleaved[2 * k + 1];

            // Even lanes of the two vectors are left samples, odd lanes right.
            const __m128 left = _mm_shuffle_ps(first, second, _MM_SHUFFLE(2, 0, 2, 0));
            const __m128 right = _mm_shuffle_ps(first, second, _MM_SHUFFLE(3, 1, 3, 1));

            averaged[k] = _mm_mul_ps(_mm_add_ps(left, right), half);
        }

        for (size_t k = 0; k < 4; ++k)
        {
            _mm_store_ps(mono_dst + out_pos + 4 * k, averaged[k]);
        }

        in_pos += floats_per_pass;
        out_pos += pairs_per_pass;
    }

    // Scalar remainder, one left/right pair at a time.
    for (size_t pair = in_pos / 2; pair < num_stereo_samples; ++pair)
    {
        const float left = stereo_src[pair * 2];
        const float right = stereo_src[pair * 2 + 1];
        mono_dst[pair] = (left + right) * 0.5f;
    }
}
|
||||
|
||||
// Peak limiter with instant attack and exponential release.
//
// For each step (16 samples, then 4 samples, then single samples) the peak
// magnitude of the step is measured and a target gain of threshold / peak
// (or 1 when the peak is within the threshold) is derived. A lower target
// is adopted immediately; a higher one is approached exponentially. The
// resulting gain multiplies every sample of the step.
//
//   src            input samples; checked against ALIGNMENT_SSE
//   dst            output buffer receiving `num_samples` floats; checked
//                  against ALIGNMENT_SSE
//   threshold      maximum allowed magnitude; must be > 0
//   limiter_state  optional pointer to one float holding the gain carried
//                  between calls; read on entry (1.0 is used when null) and
//                  updated on exit
//   sample_rate    samples per second, used to derive the release coefficient
//   num_samples    number of floats to process
//
// Returns without touching `dst` or the state when `num_samples` is zero or
// `threshold <= 0`.
SIMD_EXPORT void limit_audio(const float *src, float *dst, float threshold, float *limiter_state, float sample_rate,
                             size_t num_samples)
{
    ASSERT_ALIGNED(src, ALIGNMENT_SSE);
    ASSERT_ALIGNED(dst, ALIGNMENT_SSE);

    if (num_samples == 0 || threshold <= 0.0f)
    {
        return;
    }

    constexpr size_t simd_width = 4;
    constexpr size_t unroll_factor = 4;
    constexpr size_t block_size = simd_width * unroll_factor;
    constexpr float release_time = 0.05f;

    // The gain is updated once per step, and a step covers 16, 4 or 1
    // samples depending on the loop. Each loop therefore needs a release
    // coefficient scaled to its step size (coeff^N == exp(-N / tau)).
    // Reusing the per-sample coefficient for a whole block would stretch
    // the release time by the block length.
    const float release_samples = release_time * sample_rate;
    const float release_per_sample = std::exp(-1.0f / release_samples);
    const float release_per_vector = std::exp(-static_cast<float>(simd_width) / release_samples);
    const float release_per_block = std::exp(-static_cast<float>(block_size) / release_samples);

    // -0.0f has only the sign bit set; and-not with it yields |x| per lane.
    const auto sign_bit = _mm_set1_ps(-0.0f);

    // Largest of the four lanes of `v`.
    const auto horizontal_max = [](__m128 v) {
        const auto swapped_pairs = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1));
        const auto pair_max = _mm_max_ps(v, swapped_pairs);
        const auto swapped_halves = _mm_shuffle_ps(pair_max, pair_max, _MM_SHUFFLE(1, 0, 3, 2));
        return _mm_cvtss_f32(_mm_max_ps(pair_max, swapped_halves));
    };

    // Gain after one step whose peak magnitude is `peak`: drop immediately
    // when more reduction is needed, otherwise recover towards the target
    // by `release`.
    const auto next_gain = [threshold](float gain, float peak, float release) {
        const float target_gain = peak > threshold ? threshold / peak : 1.0f;
        if (target_gain < gain)
        {
            return target_gain;
        }
        return target_gain + (gain - target_gain) * release;
    };

    float current_gain = limiter_state != nullptr ? *limiter_state : 1.0f;

    size_t i = 0;

    // 16 samples per step.
    for (; i + block_size <= num_samples; i += block_size)
    {
        const auto a0 = _mm_load_ps(&src[i]);
        const auto a1 = _mm_load_ps(&src[i + 4]);
        const auto a2 = _mm_load_ps(&src[i + 8]);
        const auto a3 = _mm_load_ps(&src[i + 12]);

        const auto max_abs = _mm_max_ps(_mm_max_ps(_mm_andnot_ps(sign_bit, a0), _mm_andnot_ps(sign_bit, a1)),
                                        _mm_max_ps(_mm_andnot_ps(sign_bit, a2), _mm_andnot_ps(sign_bit, a3)));

        current_gain = next_gain(current_gain, horizontal_max(max_abs), release_per_block);
        const auto gain_vec = _mm_set1_ps(current_gain);

        _mm_store_ps(&dst[i], _mm_mul_ps(a0, gain_vec));
        _mm_store_ps(&dst[i + 4], _mm_mul_ps(a1, gain_vec));
        _mm_store_ps(&dst[i + 8], _mm_mul_ps(a2, gain_vec));
        _mm_store_ps(&dst[i + 12], _mm_mul_ps(a3, gain_vec));
    }

    // 4 samples per step.
    for (; i + simd_width <= num_samples; i += simd_width)
    {
        const auto a = _mm_load_ps(&src[i]);

        current_gain = next_gain(current_gain, horizontal_max(_mm_andnot_ps(sign_bit, a)), release_per_vector);

        _mm_store_ps(&dst[i], _mm_mul_ps(a, _mm_set1_ps(current_gain)));
    }

    // 1 sample per step.
    for (; i < num_samples; ++i)
    {
        const float sample = src[i];

        current_gain = next_gain(current_gain, std::fabs(sample), release_per_sample);

        dst[i] = sample * current_gain;
    }

    if (limiter_state != nullptr)
    {
        *limiter_state = current_gain;
    }
}
|
||||
|
||||
// Applies a linear fade-in at the start and a linear fade-out at the end of
// a buffer, copying the samples in between unchanged.
//
//   src               input samples; checked against ALIGNMENT_SSE
//   dst               output buffer receiving `num_samples` floats; checked
//                     against ALIGNMENT_SSE
//   fade_in_samples   length of the fade-in; sample i gets gain
//                     i / fade_in_samples
//   fade_out_samples  length of the fade-out; the sample at offset k from
//                     the fade-out start gets gain 1 - k / fade_out_samples
//   num_samples       number of floats to process
//
// NOTE(review): when fade_out_samples >= num_samples no fade-out is applied
// and samples from fade_in_samples onwards are not written to `dst`; when
// the two fades overlap, the fade-out overwrites part of the fade-in output
// rather than combining with it. Both behaviours are kept as they were;
// confirm whether they are intended.
SIMD_EXPORT void fade_audio(const float *src, float *dst, size_t fade_in_samples, size_t fade_out_samples,
                            size_t num_samples)
{
    ASSERT_ALIGNED(src, ALIGNMENT_SSE);
    ASSERT_ALIGNED(dst, ALIGNMENT_SSE);

    if (num_samples == 0)
    {
        return;
    }

    constexpr size_t simd_width = 4;
    constexpr size_t unroll_factor = 4;
    constexpr size_t block_size = simd_width * unroll_factor;

    size_t i = 0;

    if (fade_in_samples > 0)
    {
        const float fade_in_step = 1.0f / static_cast<float>(fade_in_samples);
        const size_t fade_in_end = std::min(fade_in_samples, num_samples);

        // Gains for the four consecutive samples starting at `first`.
        const auto fade_in_gain = [fade_in_step](size_t first) {
            return _mm_set_ps((first + 3) * fade_in_step, (first + 2) * fade_in_step, (first + 1) * fade_in_step,
                              first * fade_in_step);
        };

        // This section starts at index 0, so every vector offset is a
        // multiple of four floats and aligned access is valid.
        for (; i + block_size <= fade_in_end; i += block_size)
        {
            const auto a0 = _mm_load_ps(&src[i]);
            const auto a1 = _mm_load_ps(&src[i + 4]);
            const auto a2 = _mm_load_ps(&src[i + 8]);
            const auto a3 = _mm_load_ps(&src[i + 12]);

            const auto result0 = _mm_mul_ps(a0, fade_in_gain(i));
            const auto result1 = _mm_mul_ps(a1, fade_in_gain(i + 4));
            const auto result2 = _mm_mul_ps(a2, fade_in_gain(i + 8));
            const auto result3 = _mm_mul_ps(a3, fade_in_gain(i + 12));

            _mm_store_ps(&dst[i], result0);
            _mm_store_ps(&dst[i + 4], result1);
            _mm_store_ps(&dst[i + 8], result2);
            _mm_store_ps(&dst[i + 12], result3);
        }

        for (; i < fade_in_end; ++i)
        {
            const float gain = static_cast<float>(i) / static_cast<float>(fade_in_samples);
            dst[i] = src[i] * gain;
        }
    }

    const size_t middle_start = fade_in_samples;
    const size_t middle_end = num_samples > fade_out_samples ? num_samples - fade_out_samples : 0;

    if (middle_end > middle_start)
    {
        size_t j = middle_start;

        // middle_start is an arbitrary sample index, so &src[j] need not be
        // 16-byte aligned: unaligned load/store is required here.
        for (; j + block_size <= middle_end; j += block_size)
        {
            const auto a0 = _mm_loadu_ps(&src[j]);
            const auto a1 = _mm_loadu_ps(&src[j + 4]);
            const auto a2 = _mm_loadu_ps(&src[j + 8]);
            const auto a3 = _mm_loadu_ps(&src[j + 12]);

            _mm_storeu_ps(&dst[j], a0);
            _mm_storeu_ps(&dst[j + 4], a1);
            _mm_storeu_ps(&dst[j + 8], a2);
            _mm_storeu_ps(&dst[j + 12], a3);
        }

        for (; j < middle_end; ++j)
        {
            dst[j] = src[j];
        }
    }

    if (fade_out_samples > 0 && num_samples > fade_out_samples)
    {
        const size_t fade_out_start = num_samples - fade_out_samples;
        const float fade_out_step = 1.0f / static_cast<float>(fade_out_samples);

        // Gains for the four consecutive samples starting `offset` samples
        // into the fade-out.
        const auto fade_out_gain = [fade_out_step](size_t offset) {
            return _mm_set_ps(1.0f - (offset + 3) * fade_out_step,
                              1.0f - (offset + 2) * fade_out_step,
                              1.0f - (offset + 1) * fade_out_step,
                              1.0f - offset * fade_out_step);
        };

        size_t j = fade_out_start;

        // fade_out_start is an arbitrary sample index as well, so unaligned
        // access is required here too.
        for (; j + block_size <= num_samples; j += block_size)
        {
            const size_t fade_out_offset = j - fade_out_start;

            const auto a0 = _mm_loadu_ps(&src[j]);
            const auto a1 = _mm_loadu_ps(&src[j + 4]);
            const auto a2 = _mm_loadu_ps(&src[j + 8]);
            const auto a3 = _mm_loadu_ps(&src[j + 12]);

            const auto result0 = _mm_mul_ps(a0, fade_out_gain(fade_out_offset));
            const auto result1 = _mm_mul_ps(a1, fade_out_gain(fade_out_offset + 4));
            const auto result2 = _mm_mul_ps(a2, fade_out_gain(fade_out_offset + 8));
            const auto result3 = _mm_mul_ps(a3, fade_out_gain(fade_out_offset + 12));

            _mm_storeu_ps(&dst[j], result0);
            _mm_storeu_ps(&dst[j + 4], result1);
            _mm_storeu_ps(&dst[j + 8], result2);
            _mm_storeu_ps(&dst[j + 12], result3);
        }

        for (; j < num_samples; ++j)
        {
            const size_t fade_out_offset = j - fade_out_start;
            const float gain = 1.0f - static_cast<float>(fade_out_offset) / static_cast<float>(fade_out_samples);
            dst[j] = src[j] * gain;
        }
    }
}
|
||||
|
||||
// Three-band equalizer built from two one-pole smoothing filters.
//
// Per sample:
//   low  = low_cutoff  * x         + (1 - low_cutoff)  * previous low
//   high = high_cutoff * (x - low) + (1 - high_cutoff) * previous high
//   mid  = (x - low - high) * mid_factor
//   y    = low * low_gain + mid * mid_gain + high * high_gain
//
//   src          input samples; checked against ALIGNMENT_SSE
//   dst          output buffer receiving `num_samples` floats; checked
//                against ALIGNMENT_SSE
//   low_gain     gain applied to the low band
//   mid_gain     gain applied to the mid band
//   high_gain    gain applied to the high band
//   eq_state     optional pointer to two floats (low state, high state)
//                carried between calls; read on entry (zeros are used when
//                null) and updated on exit
//   num_samples  number of floats to process
//
// Both filters are recursive: each output depends on the output for the
// immediately preceding sample. The recurrence is therefore evaluated
// serially for every sample. A lane-parallel evaluation would chain sample
// n to sample n - 4 instead of n - 1, giving a different filter whose
// output depends on how many samples went through the vector path.
SIMD_EXPORT void simple_eq(const float *src, float *dst, float low_gain, float mid_gain, float high_gain, float *eq_state,
                           size_t num_samples)
{
    ASSERT_ALIGNED(src, ALIGNMENT_SSE);
    ASSERT_ALIGNED(dst, ALIGNMENT_SSE);

    if (num_samples == 0)
    {
        return;
    }

    constexpr float low_cutoff = 0.02f;
    constexpr float high_cutoff = 0.1f;
    constexpr float mid_factor = 0.7f;

    float low_state = eq_state != nullptr ? *eq_state : 0.0f;
    float high_state = eq_state != nullptr ? *(eq_state + 1) : 0.0f;

    for (size_t i = 0; i < num_samples; ++i)
    {
        const float input = src[i];

        const float low_output = low_cutoff * input + (1.0f - low_cutoff) * low_state;
        low_state = low_output;

        const float high_input = input - low_output;
        const float high_output = high_cutoff * high_input + (1.0f - high_cutoff) * high_state;
        high_state = high_output;

        const float mid_output = (input - low_output - high_output) * mid_factor;

        dst[i] = low_output * low_gain + mid_output * mid_gain + high_output * high_gain;
    }

    if (eq_state != nullptr)
    {
        *eq_state = low_state;
        *(eq_state + 1) = high_state;
    }
}
|
||||
}
|
||||
@@ -10,6 +10,7 @@ enable_testing()
|
||||
include(cmake/test_helpers.cmake)
|
||||
|
||||
# 添加各测试模块
|
||||
add_subdirectory(helpers)
|
||||
add_subdirectory(simd)
|
||||
add_subdirectory(network)
|
||||
add_subdirectory(shm)
|
||||
|
||||
4
tests/helpers/CMakeLists.txt
Normal file
4
tests/helpers/CMakeLists.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
# Helper library for the unit tests; the project name doubles as the target
# name used by target_link_libraries below.
project(alicho_test_helper)
|
||||
|
||||
# NOTE(review): simple_library is a project-defined helper not visible here;
# presumably it creates a STATIC library target named ${PROJECT_NAME} -
# confirm against its definition.
simple_library(STATIC)
|
||||
# PUBLIC: targets that link this helper also link GoogleTest (including
# gtest_main) and audio_backend_project_options.
target_link_libraries(${PROJECT_NAME} PUBLIC GTest::gtest GTest::gtest_main audio_backend_project_options)
|
||||
@@ -7,7 +7,7 @@ add_module_test(
|
||||
TARGET test_simd_basic
|
||||
SOURCE_FILE test_simd_basic.cpp
|
||||
MODULE "SIMD"
|
||||
LINK_LIBRARIES alicho_simd
|
||||
LINK_LIBRARIES alicho_simd alicho_simd_interface alicho_test_helper
|
||||
)
|
||||
|
||||
# SIMD 音频处理测试
|
||||
@@ -15,7 +15,7 @@ add_module_test(
|
||||
TARGET test_simd_audio_processing
|
||||
SOURCE_FILE test_simd_audio_processing.cpp
|
||||
MODULE "SIMD"
|
||||
LINK_LIBRARIES alicho_simd
|
||||
LINK_LIBRARIES alicho_simd alicho_simd_interface alicho_test_helper
|
||||
)
|
||||
|
||||
# 自定义目标:运行 SIMD 测试
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user