diff --git a/.gitignore b/.gitignore index 1c431ad..18a6f9d 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ .DS_Store /build/ /.vs +/out +/logs \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index b3d36b7..bfc108f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,7 +15,6 @@ include(cmake/compiler_options.cmake) include(cmake/mingw_dll.cmake) configure_project_defaults() -configure_simd_optimizations() setup_project_options( STANDARD 20 INTERFACE_TARGET audio_backend_project_options diff --git a/CMakePresets.json b/CMakePresets.json index daa69af..e2b5940 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -15,6 +15,19 @@ "CMAKE_CXX_COMPILER": "cl.exe", "CMAKE_TOOLCHAIN_FILE": "$env{VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" } + }, + { + "name": "clang", + "displayName": "Clang 20.1.8 x86_64-pc-linux-gnu", + "description": "正在使用编译器: C = /usr/bin/clang, CXX = /usr/bin/clang++", + "binaryDir": "${sourceDir}/out/build/${presetName}", + "cacheVariables": { + "CMAKE_INSTALL_PREFIX": "${sourceDir}/out/install/${presetName}", + "CMAKE_C_COMPILER": "/usr/bin/clang", + "CMAKE_CXX_COMPILER": "/usr/bin/clang++", + "CMAKE_BUILD_TYPE": "Debug", + "CMAKE_TOOLCHAIN_FILE": "$env{VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" + } } ], "buildPresets": [ diff --git a/CMakeUserPresets.json b/CMakeUserPresets.json new file mode 100644 index 0000000..1494ce8 --- /dev/null +++ b/CMakeUserPresets.json @@ -0,0 +1,12 @@ +{ + "version": 3, + "configurePresets": [ + { + "name": "clang-local", + "inherits": "clang", + "environment": { + "VCPKG_ROOT": "/home/vcpkg" + } + } + ] +} \ No newline at end of file diff --git a/cmake/compiler_options.cmake b/cmake/compiler_options.cmake index ae22b32..8b3e6df 100644 --- a/cmake/compiler_options.cmake +++ b/cmake/compiler_options.cmake @@ -74,83 +74,19 @@ function(configure_compiler_options) message(STATUS "启用彩色诊断和完整模板回溯") endif() endif() -endfunction() -# 
================================================================================================ -# 配置SIMD优化(扩展AVX512支持) -# ================================================================================================ -function(configure_simd_optimizations) - if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64") - # 检测编译器支持 - include(CheckCXXCompilerFlag) - - # AVX2支持(保持现有) - if(MSVC) - check_cxx_compiler_flag("/arch:AVX2" COMPILER_SUPPORTS_AVX2) - if(COMPILER_SUPPORTS_AVX2) - add_compile_options(/arch:AVX2) - add_compile_definitions(DAW_ENABLE_AVX2) - message(STATUS "SIMD优化: 启用AVX2指令集") - endif() - else() - check_cxx_compiler_flag("-mavx2" COMPILER_SUPPORTS_AVX2) - if(COMPILER_SUPPORTS_AVX2) - add_compile_options(-mavx2 -mfma) - add_compile_definitions(DAW_ENABLE_AVX2) - message(STATUS "SIMD优化: 启用AVX2指令集") - endif() - endif() - - # AVX512支持(新增) - if(MSVC) - check_cxx_compiler_flag("/arch:AVX512" COMPILER_SUPPORTS_AVX512) - if(COMPILER_SUPPORTS_AVX512) - add_compile_options(/arch:AVX512) - add_compile_definitions(DAW_ENABLE_AVX512) - message(STATUS "SIMD优化: 启用AVX512指令集") - else() - message(WARNING "编译器不支持AVX512,降级到AVX2") - endif() - else() - check_cxx_compiler_flag("-mavx512f" COMPILER_SUPPORTS_AVX512F) - check_cxx_compiler_flag("-mavx512vl" COMPILER_SUPPORTS_AVX512VL) - check_cxx_compiler_flag("-mavx512bw" COMPILER_SUPPORTS_AVX512BW) - - if(COMPILER_SUPPORTS_AVX512F AND COMPILER_SUPPORTS_AVX512VL) - add_compile_options(-mavx512f -mavx512vl) - add_compile_definitions(DAW_ENABLE_AVX512) - - if(COMPILER_SUPPORTS_AVX512BW) - add_compile_options(-mavx512bw) - add_compile_definitions(DAW_ENABLE_AVX512BW) - endif() - - message(STATUS "SIMD优化: 启用AVX512指令集") - else() - message(WARNING "编译器不支持完整AVX512,降级到AVX2") - endif() - endif() - elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64|ARM64") - # ARM平台:检测NEON支持 - include(CheckCXXCompilerFlag) - - if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|ARM64") - # AArch64: NEON默认可用 - add_compile_definitions(DAW_ENABLE_NEON) - message(STATUS 
"SIMD优化: 启用ARM64 NEON指令集") - else() - # ARM32: 检测NEON支持 - check_cxx_compiler_flag("-mfpu=neon" COMPILER_SUPPORTS_NEON) - if(COMPILER_SUPPORTS_NEON) - add_compile_options(-mfpu=neon) - add_compile_definitions(DAW_ENABLE_NEON) - message(STATUS "SIMD优化: 启用ARM32 NEON指令集") - else() - message(STATUS "SIMD优化: ARM32平台不支持NEON") - endif() - endif() - else() - message(STATUS "SIMD优化: 当前架构(${CMAKE_SYSTEM_PROCESSOR})不支持SIMD优化") + if (MSVC) + add_compile_definitions(ALICHO_MSVC=1) + add_compile_definitions(ALICHO_GCC=0) + add_compile_definitions(ALICHO_CLANG=0) + elseif (CMAKE_CXX_COMPILER_ID MATCHES "GNU") + add_compile_definitions(ALICHO_MSVC=0) + add_compile_definitions(ALICHO_GCC=1) + add_compile_definitions(ALICHO_CLANG=0) + elseif (CMAKE_CXX_COMPILER_ID MATCHES "Clang") + add_compile_definitions(ALICHO_MSVC=0) + add_compile_definitions(ALICHO_GCC=0) + add_compile_definitions(ALICHO_CLANG=1) endif() endfunction() @@ -159,6 +95,5 @@ endfunction() # ================================================================================================ function(apply_compiler_configuration) configure_compiler_options() - configure_simd_optimizations() message(STATUS "编译器配置完成") endfunction() \ No newline at end of file diff --git a/cmake/detect_os.cmake b/cmake/detect_os.cmake index c15c1e8..a1a3fbc 100644 --- a/cmake/detect_os.cmake +++ b/cmake/detect_os.cmake @@ -142,7 +142,12 @@ function(add_os_definitions target) # --- 阶段 3: 应用所有定义 --- # **关键:使用一次调用将所有定义添加到目标** if(definitions_list) # 确保列表非空 - target_compile_definitions(${target} PUBLIC ${definitions_list}) + get_target_property(target_type ${target} TYPE) + if(target_type STREQUAL "INTERFACE_LIBRARY") + target_compile_definitions(${target} INTERFACE ${definitions_list}) + else() + target_compile_definitions(${target} PUBLIC ${definitions_list}) + endif() endif() # 函数作用域结束时,alicho_def_* 变量会自动销毁,无需显式 unset diff --git a/cmake/retrieve_files.cmake b/cmake/retrieve_files.cmake index 9b216be..d4fd7cc 100644 --- 
a/cmake/retrieve_files.cmake +++ b/cmake/retrieve_files.cmake @@ -567,8 +567,13 @@ function(simple_library library_type) set(source_files "") retrieve_files(${CMAKE_CURRENT_SOURCE_DIR} source_files) add_library(${PROJECT_NAME} ${library_type} ${source_files}) - target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) - target_link_libraries(${PROJECT_NAME} PUBLIC audio_backend_project_options) + if(library_type STREQUAL "INTERFACE") + target_include_directories(${PROJECT_NAME} INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}) + target_link_libraries(${PROJECT_NAME} INTERFACE audio_backend_project_options) + else() + target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) + target_link_libraries(${PROJECT_NAME} PUBLIC audio_backend_project_options) + endif() message(STATUS "创建库目标: ${PROJECT_NAME},类型: ${library_type},引用路径: ${CMAKE_CURRENT_SOURCE_DIR}") add_os_definitions(${PROJECT_NAME}) endfunction() diff --git a/docs/sandbox_architecture_redesign.md b/docs/sandbox_architecture_redesign.md deleted file mode 100644 index fda506e..0000000 --- a/docs/sandbox_architecture_redesign.md +++ /dev/null @@ -1,1361 +0,0 @@ -# 沙箱管理架构重构设计文档 - -**版本**: 1.0 -**日期**: 2025-11-05 -**作者**: Alicho 架构团队 -**状态**: 设计阶段 - ---- - -## 目录 - -1. [架构概览](#1-架构概览) -2. [组件设计](#2-组件设计) -3. [数据流设计](#3-数据流设计) -4. [接口设计](#4-接口设计) -5. [实施策略](#5-实施策略) -6. [性能考虑](#6-性能考虑) - ---- - -## 1. 架构概览 - -### 1.1 设计目标 - -将当前的**单沙箱-多插件**架构改造为**灵活的多沙箱架构**,实现以下核心目标: - -- ✅ **灵活的插件-沙箱映射**:支持相同插件共享沙箱或每个实例独立沙箱 -- ✅ **任意音频路由**支持插件间的任意音频路由,包括串行、并行和复杂路由图 -- ✅ **动态生命周期管**:插件加载时创建沙箱,卸载时销毁 -- ✅ **故障隔离**:插崩溃不影响其他插件 -- ✅ **智能崩溃恢复**支持通知用户和自动重启两种恢复策略 - -### 1.2 核心设计原则 - -1. **最小侵入性**:可能复用现有组件([`process_manager`](../src/process_manager/process_manager.h:21)、[`audio_processing_task`](../src/network/transport/audio_processing_shm.h:349)) -2. **模块化设计**:组件职责清晰,易于测试和维护 -3. **性能优先**:共内存 + 无锁数据结构,保证实时性能 -4. **灵活性**:支持种插件-沙箱映射策略和音频路由模式 -5. 
**可观测性**:完的日志、监控和错误报告机制 - -### 1.3 架构组件图 - -``` -┌────────────────────────────────────────────────────────────────┐ -│ Engine 进程 │ -│ │ -│ ┌──────────────┐ ┌─────────────────┐ │ -│ │PluginManager │◄────►│ SandboxManager │ │ -│ │(用户接口) │ │(核心管理器) │ │ -│ └──────────────┘ └────────┬────────┘ │ -│ │ │ -│ ┌────────────┼─────────────┐ │ -│ ▼ ▼ ▼ │ -│ ┌─────────────┐ ┌──────────────┐ ┌──────────┐ │ -│ │AudioRouter │ │ProcessManager│ │RPCRouter │ │ -│ │(音频路由) │ │(进程理) │ │(消息路由)│ │ -│ └─────────────┘ └──────────────┘ └──────────┘ │ -│ │ │ │ │ -└────────────────┼──────────────┼────────────────┼────────────────┘ - │ │ │ - │ 共享内存 │ 进程控制 │ RPC通信 - ▼ ▼ ▼ - ┌────────────────────────────────────────────────────┐ - │ 沙箱进程 1 (插件 A) │ - │ ┌──────────┐ ┌──────────┐ ┌─────────────┐ │ - │ │host_app │─►│Plugin A-1│ │RPC Client │ │ - │ │ │ │Plugin A-2│ │ │ │ - │ └────┬─────┘ └──────────┘ └─────────────┘ │ - │ │ 共享内存段 shm_sandbox_1 │ - └───────┼────────────────────────────────────────────┘ - │ - ┌───────┼────────────────────────────────────────────┐ - │ │ 沙箱进程 2 (插件 B) │ - │ ┌────▼─────┐ ┌──────────┐ ┌─────────────┐ │ - │ │host_app │─►│Plugin B-1│ │RPC Client │ │ - │ └──────────┘ └──────────┘ └─────────────┘ │ - │ 共享内存段 shm_sandbox_2 │ - └────────────────────────────────────────────────────┘ -``` - -### 1.4 新旧架构对比 - -| 维度 | 旧架构(单沙箱) | 新架构(多沙箱) | -|------|------------------|------------------| -| **沙箱数量** | 1 个 | N 个(按需创建) | -| **插件映射** | 所有插件 → 单一沙箱 | 灵活映射(享/独立) | -| **故障隔离** | ❌ 一个插件崩溃影响所有插件 | ✅ 插件崩溃互不影响 | -| **音频路由** | 简单的串行处理 | ✅ 任意路由图 | -| **动态性** | 静态启动 | ✅ 动态创建/销毁 | -| **资源管理** | 集中管理 | 分散管理,精细控制 | -| **崩溃恢复** | 需重启整个沙箱 | ✅ 独立恢复单个插件 | -| **复杂度** | 低 | 中等 | - ---- - -## 2. 组件设计 - -### 2.1 SandboxManager(沙箱理器) - -#### 2.1.1 职责 - -`SandboxManager` 是新架构的核心组件,负责: - -1. **插件-沙箱映射管理**:维护插件实例到沙箱进程的映射关系 -2. **沙箱生命周期管**:创建、监控、销毁沙箱进程 -3. **资源追踪**:跟每个沙箱的资源使用情况 -4. 
**崩溃恢复协调**处理沙箱崩溃并执行恢复策略 - -#### 2.1.2 类设计 - -**文件位置**:[`src/engine/plugin/sandbox_manager.h`](../src/engine/plugin/sandbox_manager.h) (新建) - -```cpp -#pragma once -#include -#include -#include -#include "lazy_singleton.h" -#include "process_manager.h" -#include "plugin_type.h" - -namespace alicho { - -/** - * @brief 沙箱共享策略 - */ -enum class sandbox_sharing_policy { - SHARED, ///< 相同插件的实例共享沙箱 - ISOLATED, ///< 每个插件实例独立沙箱 - AUTO ///< 根据插件特性自动决定(未来扩展) -}; - -/** - * @brief 崩溃恢复策略 - */ -enum class crash_recovery_policy { - NOTIFY_ONLY, ///< 仅通知用户,不自动恢复 - AUTO_RESTART_ONCE, ///< 自动重启一次,失败后通知 - AUTO_RESTART, ///< 始终自动重启 -}; - -/** - * @brief 沙箱配置 - */ -struct sandbox_config { - sandbox_sharing_policy sharing_policy = sandbox_sharing_policy::SHARED; - crash_recovery_policy recovery_policy = crash_recovery_policy::AUTO_RESTART_ONCE; - uint32_t max_restart_attempts = 3; - std::chrono::milliseconds restart_cooldown{1000}; -}; - -/** - * @brief 沙箱实例信息 - */ -struct sandbox_instance { - uint32_t sandbox_id; ///< 沙箱唯一ID - uint32_t process_id; ///< 进程ID - std::string shm_name; ///< 共享内存名称 - plugin_type type; ///< 插件类型 - std::vector plugin_instances; ///< 该沙箱中的插件实例ID列表 - uint32_t restart_count = 0; ///< 重启次数 - std::chrono::system_clock::time_point last_restart_time; - - bool is_alive() const; - size_t plugin_count() const { return plugin_instances.size(); } -}; - -/** - * @brief 沙箱管理器 - * - * 负责管理所有插件沙箱的生命周期、映射关系和故障恢复 - */ -class sandbox_manager : public lazy_singleton { - friend class lazy_singleton; - -public: - /** - * @brief 为插件实例创建或获取沙箱 - * @param plugin_id 插件ID - * @param instance_id 插件实例ID(唯一识) - * @param plugin_path 插件文件路径 - * @param type 插件类型 - * @return sandbox_id 沙箱ID,失败返回0 - */ - uint32_t acquire_sandbox( - uint64_t plugin_id, - uint64_t instance_id, - const std::filesystem::path& plugin_path, - plugin_type type - ); - - /** - * @brief 释放插件实例的沙箱 - * @param instance_id 插件实例ID - * @return 否成功 - * - * 如果沙箱中没有其他插件实例,则销毁沙箱 - */ - bool release_sandbox(uint64_t instance_id); - 
- /** - * @brief 获取插件实例所在的沙箱ID - * @param instance_id 插件实例ID - * @return sandbox_id 沙箱ID,未找到返0 - */ - uint32_t get_sandbox_id(uint64_t instance_id) const; - - /** - * @brief 获取沙箱信息 - * @param sandbox_id 沙箱ID - * @return 沙箱实例信息,未找到返回nullptr - */ - const sandbox_instance* get_sandbox_info(uint32_t sandbox_id) const; - - /** - * @brief 获取沙箱的共享内存名称 - * @param sandbox_id 沙箱ID - * @return 共享内存名称,未找到返回空字符串 - */ - std::string get_shm_name(uint32_t sandbox_id) const; - - /** - * @brief 设置沙箱配置 - */ - void set_config(const sandbox_config& config); - - /** - * @brief 获取当前配置 - */ - const sandbox_config& get_config() const { return config_; } - - /** - * @brief 获取所有沙箱ID列 - */ - std::vector get_all_sandbox_ids() const; - - /** - * @brief 获取沙箱数量 - */ - size_t sandbox_count() const; - -private: - sandbox_manager() = default; - ~sandbox_manager() override; - - /** - * @brief 创建新沙箱 - */ - uint32_t create_sandbox( - plugin_type type, - const std::filesystem::path& plugin_path - ); - - /** - * @brief 销毁沙箱 - */ - bool destroy_sandbox(uint32_t sandbox_id); - - /** - * @brief 判断是否应该共享沙箱 - */ - bool should_share_sandbox(uint64_t plugin_id, plugin_type type) const; - - /** - * @brief 查找可共享的沙箱 - */ - uint32_t find_shareable_sandbox(uint64_t plugin_id, plugin_type type) const; - - /** - * @brief 生成共享内存名称 - */ - std::string generate_shm_name(uint32_t sandbox_id) const; - - /** - * @brief 处理沙箱崩溃 - */ - void handle_sandbox_crash(uint32_t sandbox_id); - - /** - * @brief 重启沙箱 - */ - bool restart_sandbox(uint32_t sandbox_id); - - /** - * @brief 进程错误回调 - */ - void on_process_error( - const process_info& info, - process_error error, - const std::string& error_message - ); - -private: - // 沙箱映射表:sandbox_id -> sandbox_instance - std::unordered_map> sandboxes_; - - // 插件实例映射表:instance_id -> sandbox_id - std::unordered_map instance_to_sandbox_; - - // 插件ID映射表:plugin_id -> sandbox_id (用于查找可共享沙箱) - std::unordered_map> plugin_to_sandboxes_; - - // 配置 - sandbox_config config_; - - // 下一个沙箱ID - uint32_t 
next_sandbox_id_{1}; - - // 互斥锁 - mutable std::mutex mutex_; -}; - -} // namespace alicho -``` - -### 2.2 AudioRouter(音频路器) - -#### 2.2.1 职责 - -`AudioRouter` 负责管理多个沙箱之间的音频数据流: - -1. **音频路由**:根路由图将音频数据路由到正确的沙箱 -2. **混音处理**:混多个音频源 -3. **缓冲管理**:管每个沙箱的音频缓冲区 -4. **同步控制**:确音频处理的时序正确 - -#### 2.2.2 类设计 - -**文件位置**:[`src/engine/audio/audio_router.h`](../src/engine/audio/audio_router.h) (新建) - -```cpp -#pragma once -#include -#include -#include -#include "lazy_singleton.h" -#include "audio_processing_shm.h" -#include "aligned_allocator.h" - -namespace alicho { - -/** - * @brief 音频连接:定义从源到目标的音频路由 - */ -struct audio_connection { - uint32_t source_sandbox_id; ///< 源沙箱ID(0表示Engine输入) - uint32_t dest_sandbox_id; ///< 目标沙箱ID(0表示Engine输出) - float gain = 1.0f; ///< 增益系数 - bool enabled = true; ///< 是否启用 -}; - -/** - * @brief 音频路由图 - */ -struct audio_routing_graph { - std::vector connections; - - /** - * @brief 添加连接 - */ - void add_connection(uint32_t source, uint32_t dest, float gain = 1.0f); - - /** - * @brief 移除连接 - */ - void remove_connection(uint32_t source, uint32_t dest); - - /** - * @brief 清除所有连接 - */ - void clear(); - - /** - * @brief 验证路由图是否有效(无环路) - */ - bool validate() const; -}; - -/** - * @brief 沙箱音频上下文 - */ -struct sandbox_audio_context { - uint32_t sandbox_id; - std::unique_ptr audio_task; - std::vector> input_buffer; - std::vector> output_buffer; - std::vector input_ptrs; - std::vector output_ptrs; - - void setup(uint32_t num_channels, uint32_t buffer_size); -}; - -/** - * @brief 音频路由器 - * - * 管理Engine与多个沙箱之间的音频路由和混音 - */ -class audio_router : public lazy_singleton { - friend class lazy_singleton; - -public: - /** - * @brief 注册沙箱 - * @param sandbox_id 沙箱ID - * @param shm_name 共享内存名称 - * @return 是否成功 - */ - bool register_sandbox(uint32_t sandbox_id, const std::string& shm_name); - - /** - * @brief 注销沙箱 - * @param sandbox_id 沙箱ID - */ - void unregister_sandbox(uint32_t sandbox_id); - - /** - * @brief 设置路由图 - * @param graph 路由图 - * @return 是否成功(图验证失败会返回false) - */ - bool 
set_routing_graph(const audio_routing_graph& graph); - - /** - * @brief 获取当前路由图 - */ - const audio_routing_graph& get_routing_graph() const { return routing_graph_; } - - /** - * @brief 处理音频帧 - * @param engine_input Engine的输入音频 - * @param engine_output Engine的输出音频 - * @param num_channels 通道数 - * @param num_samples 样本数 - * @return 是否成 - * - * 此方法执行完整的音频路由流程: - * 1. 根据路由图将engine_input分发到各沙箱 - * 2. 等待沙箱处理完成 - * 3. 收集沙箱输出并混音到engine_output - */ - bool process_audio_frame( - const float* const* engine_input, - float** engine_output, - uint32_t num_channels, - uint32_t num_samples - ); - - /** - * @brief 配置音频参数 - */ - void configure(uint32_t num_channels, uint32_t buffer_size, uint32_t sample_rate); - -private: - audio_router() = default; - ~audio_router() override; - - /** - * @brief 分发音频到沙箱 - */ - void distribute_to_sandboxes( - const float* const* source_audio, - uint32_t num_channels, - uint32_t num_samples - ); - - /** - * @brief 从沙箱收集并混音 - */ - void collect_and_mix( - float** dest_audio, - uint32_t num_channels, - uint32_t num_samples - ); - - /** - * @brief 混音多个音频源 - */ - void mix_audio( - float** dest, - const std::vector& sources, - const std::vector& gains, - uint32_t num_channels, - uint32_t num_samples - ); - -private: - // 沙箱音频上下文映射表 - std::unordered_map> contexts_; - - // 路由图 - audio_routing_graph routing_graph_; - - // 音频配置 - uint32_t num_channels_{2}; - uint32_t buffer_size_{512}; - uint32_t sample_rate_{48000}; - - // 互斥锁 - mutable std::mutex mutex_; -}; - -} // namespace alicho -``` - -### 2.3 进程管理增强 - -现有的 [`process_manager`](../src/process_manager/process_manager.h:21) 已经支持多进程管理,因此**无需大幅修改**。 - -#### 2.3.1 需要的调整 - -1. **进程回调增强**确保 `SandboxManager` 能够监听进程状态变化 -2. **进程标识**:为箱进程添加额外的元数据(sandbox_id) - -**修改文件**:[`src/process_manager/process_types.h`](../src/process_manager/process_types.h) - -```cpp -// 在 process_info 结构体中添加: -struct process_info { - // ... 现有字段 ... 
- - // 新增:沙箱相关信息 - uint32_t sandbox_id = 0; ///< 沙箱ID(0表示非沙箱进程) - std::string sandbox_shm_name; ///< 沙箱的共享内存名称 -}; -``` - -### 2.4 RPC 通信改造 - -#### 2.4.1 增 RPC 消息类型 - -**修改文件**:[`src/network/rpc/host_rpc.h`](../src/network/rpc/host_rpc.h) - -```cpp -namespace host_rpc { - -// 现有的 setup_t 和 shutdown_t ... - -/** - * @brief 加载插件请求 - */ -struct load_plugin_t { - uint64_t instance_id; ///< 插件实例ID - std::string plugin_path; ///< 插件文件路径 - plugin_type type; ///< 插件类型 -}; - -/** - * @brief 加载插件响应 - */ -struct load_plugin_response_t { - bool success; - uint64_t instance_id; - std::string error_message; -}; - -/** - * @brief 卸载插件请求 - */ -struct unload_plugin_t { - uint64_t instance_id; ///< 插件实例ID -}; - -/** - * @brief 卸载插件响应 - */ -struct unload_plugin_response_t { - bool success; - uint64_t instance_id; -}; - -/** - * @brief 插件状态查询请求 - */ -struct plugin_status_query_t { - uint64_t instance_id; -}; - -/** - * @brief 插件状态查询响应 - */ -struct plugin_status_response_t { - uint64_t instance_id; - bool loaded; - bool processing; - std::string status_message; -}; - -} // namespace host_rpc -``` - -#### 2.4.2 RPC 路由机制 - -**修改文件**:[`src/engine/plugin/rpc_router.h`](../src/engine/plugin/rpc_router.h) (新建) - -```cpp -#pragma once -#include -#include "lazy_singleton.h" -#include "zmq_server.h" - -namespace alicho { - -/** - * @brief RPC 路由器 - * - * 负责将RPC消息路由到正确的沙箱客户端 - */ -class rpc_router : public lazy_singleton { - friend class lazy_singleton; - -public: - /** - * @brief 注册沙箱的RPC客户端ID - * @param sandbox_id 沙箱ID - * @param client_id ZMQ客户端ID - */ - void register_sandbox_client(uint32_t sandbox_id, uint32_t client_id); - - /** - * @brief 注销沙箱 - */ - void unregister_sandbox_client(uint32_t sandbox_id); - - /** - * @brief 获取沙箱的客户端ID - * @param sandbox_id 沙箱ID - * @return client_id,未找到返回0 - */ - uint32_t get_client_id(uint32_t sandbox_id) const; - - /** - * @brief 向指定沙箱发送消息 - * @tparam T 消息类型 - * @param sandbox_id 沙箱ID - * @param message 消息对象 - * @return 是否成功 - */ - template - bool 
send_to_sandbox(uint32_t sandbox_id, const T& message) { - auto client_id = get_client_id(sandbox_id); - if (client_id == 0) { - return false; - } - zmq_server::instance().send(client_id, message); - return true; - } - -private: - rpc_router() = default; - - // 沙箱ID -> ZMQ客户端ID 映射表 - std::unordered_map sandbox_to_client_; - std::unordered_map client_to_sandbox_; - - mutable std::mutex mutex_; -}; - -} // namespace alicho -``` - -### 2.5 共享内存改造 - -#### 2.5.1 命名策略 - -为每个沙箱创建独立的共享内存段,命名规则: - -```cpp -std::string generate_shm_name(uint32_t sandbox_id) { -#ifdef _WIN32 - return "Local\\alicho_sandbox_" + std::to_string(sandbox_id); -#else - return "/alicho_sandbox_" + std::to_string(sandbox_id); -#endif -} -``` - -#### 2.5.2 生命周期管理 - -- **创建时机**:`SandboxManager::create_sandbox()` 时创建 -- **销毁时机**:`SandboxManager::destroy_sandbox()` 时销毁 -- **所有权**:Engine 进程负责创建和销毁,沙箱进程仅连接 - -#### 2.5.3 内存泄漏预防 - -**修改文件**:[`src/network/shm/shared_memory_manager.h`](../src/network/shm/shared_memory_manager.h) - -添加清理功能: - -```cpp -/** - * @brief 清理指定名称的共享内存段(即使当前未打开) - * @param shm_name 共享内存名称 - * @return 是否成功 - */ -static bool cleanup_shared_memory(const std::string& shm_name); - -/** - * @brief 清理所有alicho相关的共享内存段 - * 用于启动时清理残留的共享内存 - */ -static void cleanup_all_alicho_shm(); -``` - ---- - -## 3. 
数据流设计 - -### 3.1 插件加载流程 - -```mermaid -sequenceDiagram - participant User - participant PluginManager - participant SandboxMgr as SandboxManager - participant ProcessMgr as ProcessManager - participant AudioRouter - participant RPCRouter - participant Sandbox as 沙箱进程 - - User->>PluginManager: 加载插件(path, type) - PluginManager->>SandboxMgr: acquire_sandbox(plugin_id, instance_id, path, type) - - alt 需要创建新沙箱 - SandboxMgr->>SandboxMgr: generate_shm_name(sandbox_id) - SandboxMgr->>ProcessMgr: launch_process(executable, args) - ProcessMgr->>Sandbox: 启动沙箱进程 - Sandbox->>RPCRouter: 连接RPC(client_id) - RPCRouter->>SandboxMgr: 通知沙箱已连接 - SandboxMgr->>Sandbox: 发送 setup_t(shm_name) - Sandbox->>Sandbox: 连接共享内存 - Sandbox->>SandboxMgr: 返回初始化成功 - SandboxMgr->>AudioRouter: register_sandbox(sandbox_id, shm_name) - else 使用现有沙箱 - SandboxMgr->>SandboxMgr: find_shareable_sandbox() - end - - SandboxMgr->>Sandbox: 发送 load_plugin_t(instance_id, path) - Sandbox->>Sandbox: 加载插件 - Sandbox->>SandboxMgr: 返回 load_plugin_response_t(success) - SandboxMgr->>PluginManager: 返回 sandbox_id - PluginManager->>User: 返回插件实例 -``` - -### 3.2 音频处理流程 - -```mermaid -sequenceDiagram - participant Engine - participant AudioRouter - participant SHM1 as 共享内存1 - participant Sandbox1 as 沙箱1 - participant SHM2 as 共享内存2 - participant Sandbox2 as 沙箱2 - - Engine->>AudioRouter: process_audio_frame(input, output) - - par 并行分发到沙箱 - AudioRouter->>SHM1: 写入音频数据 - AudioRouter->>SHM1: post(request_semaphore) - AudioRouter->>SHM2: 写入音频数据 - AudioRouter->>SHM2: post(request_semaphore) - end - - par 沙箱并行处理 - Sandbox1->>SHM1: wait(request_semaphore) - Sandbox1->>Sandbox1: 处理音频 - Sandbox1->>SHM1: 写入处理结果 - Sandbox1->>SHM1: post(response_semaphore) - - Sandbox2->>SHM2: wait(request_semaphore) - Sandbox2->>Sandbox2: 处理音频 - Sandbox2->>SHM2: 写入处理结果 - Sandbox2->>SHM2: post(response_semaphore) - end - - par 并行收集结果 - AudioRouter->>SHM1: wait(response_semaphore) - AudioRouter->>SHM1: 读取处理结果 - - AudioRouter->>SHM2: wait(response_semaphore) - 
AudioRouter->>SHM2: 读取处理结果 - end - - AudioRouter->>AudioRouter: 混音所有结果 - AudioRouter->>Engine: 返回混音后的输出 -``` - -### 3.3 插件卸载流程 - -```mermaid -sequenceDiagram - participant User - participant PluginManager - participant SandboxMgr as SandboxManager - participant Sandbox as 沙箱进程 - participant AudioRouter - participant ProcessMgr as ProcessManager - - User->>PluginManager: 卸载插件(instance_id) - PluginManager->>SandboxMgr: release_sandbox(instance_id) - - SandboxMgr->>Sandbox: 发送 unload_plugin_t(instance_id) - Sandbox->>Sandbox: 卸载插件 - Sandbox->>SandboxMgr: 返回 unload_plugin_response_t - - SandboxMgr->>SandboxMgr: 从映射表中移除instance_id - - alt 沙箱中无其他插件 - SandboxMgr->>AudioRouter: unregister_sandbox(sandbox_id) - SandboxMgr->>Sandbox: 发送 shutdown_t - Sandbox->>Sandbox: 清理资源 - Sandbox->>Sandbox: 退出进程 - SandboxMgr->>ProcessMgr: 等待进程结束 - SandboxMgr->>SandboxMgr: 清理共享内存 - SandboxMgr->>SandboxMgr: 从沙箱表中移除 - end - - SandboxMgr->>PluginManager: 返回成功 - PluginManager->>User: 插件已卸载 -``` - -### 3.4 崩溃恢复流程 - -```mermaid -sequenceDiagram - participant Sandbox as 沙箱进程 - participant ProcessMonitor as 进程监控器 - participant SandboxMgr as SandboxManager - participant AudioRouter - participant User - - Sandbox->>Sandbox: 插件崩溃 - Sandbox->>Sandbox: 进程异常退出 - - ProcessMonitor->>ProcessMonitor: 检测到进程退出 - ProcessMonitor->>SandboxMgr: on_process_error(sandbox_id) - - SandboxMgr->>SandboxMgr: handle_sandbox_crash(sandbox_id) - - alt recovery_policy = NOTIFY_ONLY - SandboxMgr->>User: 通知插件崩溃 - SandboxMgr->>AudioRouter: unregister_sandbox(sandbox_id) - SandboxMgr->>SandboxMgr: 标记沙箱为失败状态 - else recovery_policy = AUTO_RESTART - alt restart_count < max_attempts - SandboxMgr->>SandboxMgr: restart_sandbox(sandbox_id) - SandboxMgr->>ProcessMonitor: launch_process() - ProcessMonitor->>Sandbox: 重启沙箱进程 - Sandbox->>SandboxMgr: 重新连接 - SandboxMgr->>Sandbox: 重新加载插件 - - alt 重启成功 - SandboxMgr->>AudioRouter: 重新注册沙箱 - SandboxMgr->>User: 通知恢复成功 - else 重启失败 - SandboxMgr->>User: 通知恢复失败 - SandboxMgr->>AudioRouter: 
unregister_sandbox() - end - else 超过重启次数 - SandboxMgr->>User: 通知多次崩溃,放弃恢复 - SandboxMgr->>AudioRouter: unregister_sandbox() - end - end -``` - ---- - -## 4. 接口定义 - -### 4.1 PluginManager 扩展接口 - -**修改文件**:[`src/engine/plugin/host_manager.h`](../src/engine/plugin/host_manager.h) - -将 `host_manager` 重命名为 `plugin_manager` 扩展功能: - -```cpp -#pragma once -#include "lazy_singleton.h" -#include "sandbox_manager.h" -#include "plugin_type.h" -#include -#include - -namespace alicho { - -/** - * @brief 插件实例信息 - */ -struct plugin_instance_info { - uint64_t instance_id; ///< 实例唯一ID - uint64_t plugin_id; ///< 插件ID - std::string name; ///< 插件名称 - plugin_type type; ///< 插件类型 - uint32_t sandbox_id; ///< 所在沙箱ID - bool is_active; ///< 是否活跃 - std::chrono::system_clock::time_point load_time; -}; - -/** - * @brief 插件管理器(原host_manager的扩展版本) - */ -class plugin_manager : public lazy_singleton { - friend class lazy_singleton; - -public: - /** - * @brief 加载插件 - * @param plugin_path 插件文件路径 - * @param type 插件类型 - * @return instance_id 插件实例ID,失败回0 - */ - uint64_t load_plugin( - const std::filesystem::path& plugin_path, - plugin_type type - ); - - /** - * @brief 卸载插件 - * @param instance_id 插件实例ID - * @return 是否成功 - */ - bool unload_plugin(uint64_t instance_id); - - /** - * @brief 获取插件实例信息 - */ - std::optional get_plugin_info(uint64_t instance_id) const; - - /** - * @brief 获取所有已加载的插件实例ID - */ - std::vector get_all_plugin_instances() const; - - /** - * @brief 设置沙箱配置 - */ - void set_sandbox_config(const sandbox_config& config); - - /** - * @brief 设置音频路由图 - */ - bool set_audio_routing(const audio_routing_graph& graph); - -private: - plugin_manager() = default; - - uint64_t generate_instance_id(); - uint64_t calculate_plugin_id(const std::filesystem::path& path) const; - -private: - std::unordered_map instances_; - uint64_t next_instance_id_{1}; - mutable std::mutex mutex_; -}; - -} // namespace alicho -``` - -### 4.2 RPC 消息处理器 - 
-**新建文件**:[`src/host_sandbox/common/rpc/plugin_rpc_handlers.cpp`](../src/host_sandbox/common/rpc/plugin_rpc_handlers.cpp) - -```cpp -#include "zmq_client.h" -#include "host_rpc.h" -#include "logger.h" - -// 加载插件请求处理器 -ZMQ_CLIENT_REGISTER_PROCESSOR(host_rpc::load_plugin_t) { - host_rpc::load_plugin_response_t response; - response.instance_id = data.instance_id; - - try { - // 根据插件类型加载插件 - // 实际实现在具体的host_app派生类中 - bool success = load_plugin_impl(data.instance_id, data.plugin_path, data.type); - response.success = success; - if (!success) { - response.error_message = "Failed to load plugin"; - } - } catch (const std::exception& e) { - response.success = false; - response.error_message = e.what(); - } - - // 发送响应 - zmq_client::instance().send(response); -} - -// 卸载插件请求处理器 -ZMQ_CLIENT_REGISTER_PROCESSOR(host_rpc::unload_plugin_t) { - host_rpc::unload_plugin_response_t response; - response.instance_id = data.instance_id; - - try { - bool success = unload_plugin_impl(data.instance_id); - response.success = success; - } catch (const std::exception& e) { - response.success = false; - } - - zmq_client::instance().send(response); -} -``` - ---- - -## 5. 实施策略 - -### 5.1 实施优先级 - -按照以下优先级顺序实施,确保每个阶段都有可运行的系统: - -#### 阶段 1:基础架构(高先级) - -1. ✅ **SandboxManager 基础实现** - - 文件:[`src/engine/plugin/sandbox_manager.h`](../src/engine/plugin/sandbox_manager.h) - - 文件:[`src/engine/plugin/sandbox_manager.cpp`](../src/engine/plugin/sandbox_manager.cpp) - - 功能:沙箱创建、销毁、基本映射管理 - - 测试:单元测试创建和销毁沙箱 - -2. ✅ **RPC 消息扩展** - - 文件:[`src/network/rpc/host_rpc.h`](../src/network/rpc/host_rpc.h) - - 功能:新增 `load_plugin_t`、`unload_plugin_t` 等消息类型 - - 测试:RPC 消息序列化测试 - -3. ✅ **RPCRouter 实现** - - 文件:[`src/engine/plugin/rpc_router.h`](../src/engine/plugin/rpc_router.h) - - 文件:[`src/engine/plugin/rpc_router.cpp`](../src/engine/plugin/rpc_router.cpp) - - 功能:消息路由到正确的沙箱 - - 测试:多客户端路由测试 - -#### 阶段 2:插件管理(高先级) - -4. 
✅ **PluginManager 重构** - - 文件:[`src/engine/plugin/host_manager.h`](../src/engine/plugin/host_manager.h) → `plugin_manager.h` - - 文件:[`src/engine/plugin/host_manager.cpp`](../src/engine/plugin/host_manager.cpp) → `plugin_manager.cpp` - - 功能:集成 SandboxManager,实现整的插件加载/卸载流程 - - 测试:端到端插件加载测试 - -5. ✅ **共享内存生命周管理** - - 文件:[`src/network/shm/shared_memory_manager.h`](../src/network/shm/shared_memory_manager.h) - - 功能:添加清理函数,防止内存泄漏 - - 测试:内存泄漏检测测试 - -#### 阶段 3:音频路由(中先级) - -6. ⚠️ **AudioRouter 基础实现** - - 文件:[`src/engine/audio/audio_router.h`](../src/engine/audio/audio_router.h) - - 文件:[`src/engine/audio/audio_router.cpp`](../src/engine/audio/audio_router.cpp) - - 功能:简单的并行分发和混音 - - 测试:基础音频路由测试 - -7. ⚠️ **路由图管理** - - 文件:同上 - - 功能:动态路由图、环路检测 - - 测试:复杂路由图测试 - -#### 阶段 4:高级特性(低先级) - -8. 📋 **沙箱共享策略** - - 文件:[`src/engine/plugin/sandbox_manager.cpp`](../src/engine/plugin/sandbox_manager.cpp) - - 功能:实现插件共享逻辑 - - 测试:共享策略测试 - -9. 📋 **崩溃恢复机制** - - 文件:[`src/engine/plugin/sandbox_manager.cpp`](../src/engine/plugin/sandbox_manager.cpp) - - 功能:自动重启、通知机制 - - 测试:崩溃恢复测试 - -10. 
📋 **性能优化** - - 所有组件 - - 功能:减少延迟、优化内存使用 - - 测试:性能基准测试 - -### 5.2 向后兼容性 - -根据求,**不需要保留向兼容性**,但建议: - -- ✅ 保留旧的 API 签名,内部重构为新实现 -- ✅ 提供迁移脚本或文档 -- ✅ 在过渡期提供配置开关(可选) - -### 5.3 测试策略 - -#### 单元测试 - -针对每个组件编写独立的单元测试: - -```cpp -// tests/engine/test_sandbox_manager.cpp -TEST(SandboxManagerTest, CreateAndDestroySandbox) { - auto& mgr = sandbox_manager::instance(); - - uint32_t sandbox_id = mgr.create_sandbox( - plugin_type::VST2, - "path/to/plugin.dll" - ); - - EXPECT_NE(sandbox_id, 0); - EXPECT_TRUE(mgr.destroy_sandbox(sandbox_id)); -} - -TEST(SandboxManagerTest, SharingPolicy) { - auto& mgr = sandbox_manager::instance(); - sandbox_config config; - config.sharing_policy = sandbox_sharing_policy::SHARED; - mgr.set_config(config); - - // 加载两个相同插件的实例 - uint32_t sb1 = mgr.acquire_sandbox(1, 101, "plugin.dll", plugin_type::VST2); - uint32_t sb2 = mgr.acquire_sandbox(1, 102, "plugin.dll", plugin_type::VST2); - - // 应该共享同一个沙箱 - EXPECT_EQ(sb1, sb2); -} -``` - -#### 集成测试 - -测试组件之间的协作: - -```cpp -// tests/integration/test_plugin_lifecycle.cpp -TEST(PluginLifecycleTest, LoadAndUnloadPlugin) { - auto& pm = plugin_manager::instance(); - - uint64_t instance_id = pm.load_plugin("plugin.dll", plugin_type::VST2); - EXPECT_NE(instance_id, 0); - - auto info = pm.get_plugin_info(instance_id); - ASSERT_TRUE(info.has_value()); - EXPECT_EQ(info->type, plugin_type::VST2); - - EXPECT_TRUE(pm.unload_plugin(instance_id)); -} -``` - -#### 压力测试 - -```cpp -// tests/stress/test_multi_sandbox.cpp -TEST(StressTest, Load100Plugins) { - auto& pm = plugin_manager::instance(); - std::vector instances; - - for (int i = 0; i < 100; ++i) { - uint64_t id = pm.load_plugin("plugin.dll", plugin_type::VST2); - EXPECT_NE(id, 0); - instances.push_back(id); - } - - // 验证所有插件都成功加载 - EXPECT_EQ(instances.size(), 100); - - // 卸载所有插件 - for (uint64_t id : instances) { - EXPECT_TRUE(pm.unload_plugin(id)); - } -} -``` - -### 5.4 潜在风险与缓解措施 - -| 风险 | 影响 | 概率 | 缓解措施 | -|------|------|------|----------| -| **多进程同步问题** | 高 | 中 | - 
使用成熟的同步原语(信号量)
- 完善的单元测试
- 添加超时机制 | -| **共享内存泄漏** | 中 | 中 | - 实现自动清理机制
- 进程退出时清理
- 启动时扫描并清理残留 | -| **沙箱进程僵尸** | 中 | 低 | - 使用 process_monitor 监控
- 定期健康检查
- 超时强制终止 | -| **音频延迟增加** | 高 | 中 | - 性能基准测试
- 优化共享内存访问
- 考虑使用零拷贝技术 | -| **路由图环路** | 低 | 低 | - 实现环路检测算法
- 设置最大路由深度
- 用户界面提示 | -| **崩溃恢复失败** | 中 | 中 | - 实现重试机制
- 提供降级策略
- 详细的错误日志 | - ---- - -## 6. 性能考虑 - -### 6.1 多进程开销分析 - -#### 6.1.1 进程创建开销 - -- **创建时间**:Windows 约 50-100ms,Linux 约 10-50ms -- **内存占用**:每个沙箱进程约 10-50MB(取决于加载的插件) -- **缓解措施**: - - 延迟创建:仅在用户实际加载插件时创建沙箱 - - 进程池:预创建若干沙箱进程(可选,低优先级) - - 共享策略:相同插件共享沙箱 - -#### 6.1.2 进程间通信开销 - -| 通信方式 | 延迟 | 吞吐量 | 使用场景 | -|----------|------|--------|----------| -| **共享内存** | < 1μs | 极高 | 音频数据传输 | -| **ZMQ (RPC)** | 10-100μs | 中等 | 控制消息 | -| **信号量** | < 1μs | N/A | 同步信号 | - -### 6.2 内存使用评估 - -#### 6.2.1 Engine 进程内存 - -``` -基础内存消耗: -- SandboxManager: ~100 KB -- AudioRouter: ~500 KB (取决于沙箱数量) -- RPCRouter: ~50 KB - -每个沙箱的映射数据: ~1 KB -每个音频上下文: ~100 KB (2通道 × 512样本 × 多个缓冲区) - -总计(10个沙箱): ~1.7 MB -``` - -#### 6.2.2 沙箱进程内存 - -``` -每个沙箱进程: -- host_app 框架: ~5 MB -- RPC 客户端: ~2 MB -- 共享内存映射: ~2 MB -- 插件本身: 10-100 MB (取决于插件) - -总计: 20-110 MB -``` - -#### 6.2.3 共享内存段 - -``` -每个共享内存段大小: -- audio_processing_shm_state: ~1 KB -- 输入 Ring Buffer: 配置决定 (默认 ~500 KB) -- 输出 Ring Buffer: 配置决定 (默认 ~500 KB) - -总计(每个沙箱): ~1 MB -``` - -### 6.3 延迟影响分析 - -#### 6.3.1 音频处理路径延迟 - -``` -传统单沙箱架构: -Engine → 共享内存 → 单一沙箱 → 共享内存 → Engine -延迟:约 1-2ms (512 samples @ 48kHz) - -新多沙箱架构(并行): -Engine → [分发] → 多个沙箱(并行处理) → [收集+混音] → Engine -延迟:约 1.5-3ms (主要是同步开销) - -延迟增加:约 0.5-1ms -``` - -#### 6.3.2 延迟优化策略 - -1. **零拷贝技术**: - ```cpp - // 使用零拷贝API直接操作共享内存 - auto* frame = audio_task.acquire_send_frame(); - // 直接写入frame->data,无需额外拷贝 - memcpy(frame->data, source, size); - audio_task.commit_send_frame(); - ``` - -2. **批处理**: - - 一次处理多个音频帧 - - 减少同步次数 - -3. 
**优先级调整**: - ```cpp - // 将沙箱进程设置为实时优先级 - #ifdef _WIN32 - SetPriorityClass(process_handle, REALTIME_PRIORITY_CLASS); - #else - struct sched_param param; - param.sched_priority = 99; - pthread_setschedparam(thread, SCHED_FIFO, &param); - #endif - ``` - -### 6.4 性能基准测试 - -需要建立以下性能基准: - -```cpp -// tests/benchmark/benchmark_audio_routing.cpp -void BM_AudioRouting(benchmark::State& state) { - int num_sandboxes = state.range(0); - - // 设置测试环境 - setup_sandboxes(num_sandboxes); - - for (auto _ : state) { - // 测量音频处理延迟 - auto start = std::chrono::high_resolution_clock::now(); - router.process_audio_frame(input, output, 2, 512); - auto end = std::chrono::high_resolution_clock::now(); - - auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>( - end - start - ); - state.SetIterationTime(elapsed.count() / 1000000.0); - } - - state.SetItemsProcessed(state.iterations() * 512); -} - -BENCHMARK(BM_AudioRouting) - ->Arg(1) // 1个沙箱 - ->Arg(4) // 4个沙箱 - ->Arg(10) // 10个沙箱 - ->Arg(20); // 20个沙箱 -``` - -### 6.5 性能目标 - -| 指标 | 目标值 | 可接受值 | -|------|--------|----------| -| **插件加载时间** | < 500ms | < 1s | -| **音频处理延迟(单个插件)** | < 2ms | < 5ms | -| **音频处理延迟(10个插件并行)** | < 3ms | < 8ms | -| **内存占用(10个沙箱)** | < 500 MB | < 1 GB | -| **CPU 占用(10个插件活跃)** | < 30% | < 50% | - ---- - -## 附录 - -### A. 需要修改的现有文件清单 - -1. ✏️ [`src/engine/plugin/host_manager.h`](../src/engine/plugin/host_manager.h) - 重命名为 `plugin_manager.h` 并扩展 -2. ✏️ [`src/engine/plugin/host_manager.cpp`](../src/engine/plugin/host_manager.cpp) - 重命名并实现新功能 -3. ✏️ [`src/process_manager/process_types.h`](../src/process_manager/process_types.h) - 添加沙箱元数据字段 -4. ✏️ [`src/network/rpc/host_rpc.h`](../src/network/rpc/host_rpc.h) - 添加新的 RPC 消息类型 -5. ✏️ [`src/network/shm/shared_memory_manager.h`](../src/network/shm/shared_memory_manager.h) - 添加清理函数 -6. ✏️ [`src/host_sandbox/common/host_app.h`](../src/host_sandbox/common/host_app.h) - 支持多插件实例 -7. ✏️ [`src/engine/main.cpp`](../src/engine/main.cpp) - 集成新的插件管理器 - -### B. 需要新建的文件清单 - -1. 
➕ [`src/engine/plugin/sandbox_manager.h`](../src/engine/plugin/sandbox_manager.h) -2. ➕ [`src/engine/plugin/sandbox_manager.cpp`](../src/engine/plugin/sandbox_manager.cpp) -3. ➕ [`src/engine/plugin/rpc_router.h`](../src/engine/plugin/rpc_router.h) -4. ➕ [`src/engine/plugin/rpc_router.cpp`](../src/engine/plugin/rpc_router.cpp) -5. ➕ [`src/engine/audio/audio_router.h`](../src/engine/audio/audio_router.h) -6. ➕ [`src/engine/audio/audio_router.cpp`](../src/engine/audio/audio_router.cpp) -7. ➕ [`src/host_sandbox/common/rpc/plugin_rpc_handlers.cpp`](../src/host_sandbox/common/rpc/plugin_rpc_handlers.cpp) -8. ➕ [`tests/engine/test_sandbox_manager.cpp`](../tests/engine/test_sandbox_manager.cpp) -9. ➕ [`tests/engine/test_audio_router.cpp`](../tests/engine/test_audio_router.cpp) -10. ➕ [`tests/integration/test_plugin_lifecycle.cpp`](../tests/integration/test_plugin_lifecycle.cpp) - -### C. 参考资料 - -1. **Boost.Interprocess 文档**:https://www.boost.org/doc/libs/release/doc/html/interprocess.html -2. **ZeroMQ Guide**:https://zguide.zeromq.org/ -3. **实时音频编程指南**:https://www.rossbencina.com/code/real-time-audio-programming-101-time-waits-for-nothing -4. 
**VST SDK 文档**:https://steinbergmedia.github.io/vst3_doc/ - ---- - -**文档结束** - -*本设计文档将根据实施过程中的反馈和发现持续更新。* \ No newline at end of file diff --git a/src/engine/audio/audio_router.h b/src/engine/audio/audio_router.h index 2c99d1e..419e158 100644 --- a/src/engine/audio/audio_router.h +++ b/src/engine/audio/audio_router.h @@ -40,6 +40,7 @@ #include #include #include +#include // 前向声明 class audio_processing_task; diff --git a/src/engine/plugin/plugin_database.h b/src/engine/plugin/plugin_database.h index b84466c..2259387 100644 --- a/src/engine/plugin/plugin_database.h +++ b/src/engine/plugin/plugin_database.h @@ -4,6 +4,7 @@ #include "lazy_singleton.h" #include +#include #include "plugin_type.h" diff --git a/src/host_sandbox/common/host_app.h b/src/host_sandbox/common/host_app.h index e5481d2..cc15cd7 100644 --- a/src/host_sandbox/common/host_app.h +++ b/src/host_sandbox/common/host_app.h @@ -2,6 +2,7 @@ #include #include #include +#include #include "aligned_allocator.h" #include "transport/audio_processing_shm.h" diff --git a/src/misc/lib_handle.h b/src/misc/lib_handle.h new file mode 100644 index 0000000..708f5ba --- /dev/null +++ b/src/misc/lib_handle.h @@ -0,0 +1,39 @@ +#pragma once +#include +#include + +class lib_handle { +public: + lib_handle() = default; + ~lib_handle() { close(); } + + auto open(const std::filesystem::path& lib_path) -> bool; + void close(); + + // 通过函数名称和函数签名获取函数指针 + template + auto get_function_by_name(const std::string& func_name) -> std::function { + auto raw_func_ptr = get_raw_function(func_name); + if (!raw_func_ptr) { + return nullptr; + } + + // 将原始函数指针转换为函数指针类型 + // Func 是函数签名,如 int(float, double) + // Func* 是对应的函数指针类型,如 int(*)(float, double) + using func_ptr_type = Func*; + auto typed_func_ptr = reinterpret_cast(raw_func_ptr); + + return std::function(typed_func_ptr); + } +private: + auto get_raw_function(const std::string& func_name) -> void*; + void* handle_{nullptr}; +}; + +// 通过函数签名自动推导类型并获取函数 +// 用法示例:auto func = 
get_function_by_func_signature(handle, my_function); +// 其中 my_function 是实际的函数名称 +#define get_function_by_func_signature(lib_handle_instance, func_signature) \ + (lib_handle_instance).get_function_by_name>(#func_signature) + diff --git a/src/misc/linux/lib_handle.cpp b/src/misc/linux/lib_handle.cpp new file mode 100644 index 0000000..05edf24 --- /dev/null +++ b/src/misc/linux/lib_handle.cpp @@ -0,0 +1,24 @@ +#include "lib_handle.h" + +#include + +auto lib_handle::open(const std::filesystem::path& lib_path) -> bool { + close(); + + handle_ = dlopen(lib_path.c_str(), RTLD_LAZY); + return handle_ != nullptr; +} + +void lib_handle::close() { + if (handle_) { + dlclose(handle_); + handle_ = nullptr; + } +} + +auto lib_handle::get_raw_function(const std::string& func_name) -> void* { + if (!handle_) { + return nullptr; + } + return dlsym(handle_, func_name.c_str()); +} diff --git a/src/misc/linux/thread_tool.cpp b/src/misc/linux/thread_tool.cpp index 8ee0863..a62ea25 100644 --- a/src/misc/linux/thread_tool.cpp +++ b/src/misc/linux/thread_tool.cpp @@ -16,41 +16,14 @@ #include "thread_tool.h" +#include +#include +#include +#include +#include "logger.h" + /** * @brief 设置线程CPU亲和性(Linux占位实现) - * - * 当前返回false表示功能未实现。 - * - * ## 计划实现 - * - * 完整实现应该使用pthread_setaffinity_np或sched_setaffinity: - * - * @code - * #include - * #include - * - * bool thread_set_affinity(boost::thread& thread, int core_id) { - * cpu_set_t cpuset; - * CPU_ZERO(&cpuset); // 清空CPU集合 - * CPU_SET(core_id, &cpuset); // 设置指定的CPU核心 - * - * // 使用pthread API设置亲和性 - * int result = pthread_setaffinity_np( - * thread.native_handle(), // pthread线程句柄 - * sizeof(cpu_set_t), // CPU集合大小 - * &cpuset // CPU集合指针 - * ); - * - * if (result != 0) { - * log_module_error(THREAD_TOOL_LOG_MODULE, - * "无法设置线程亲和性到核心{}: {}", - * core_id, strerror(result)); - * return false; - * } - * return true; - * } - * @endcode - * * ### API说明 * - cpu_set_t:CPU集合类型,表示一组CPU核心 * - CPU_ZERO:清空CPU集合的所有位 @@ -70,47 +43,30 @@ * @todo 实现实际的线程亲和性设置功能 */ 
bool thread_set_affinity(boost::thread& thread, int core_id) { - // Linux implementation can be added here - // TODO: 使用pthread_setaffinity_np或sched_setaffinity实现 - return false; // Placeholder - 占位实现,表示功能未实现 + // 创建CPU集合 + cpu_set_t cpuset; + CPU_ZERO(&cpuset); // 清空CPU集合 + CPU_SET(core_id, &cpuset); // 设置指定的CPU核心 + + // 使用pthread API设置亲和性 + const int result = pthread_setaffinity_np( + thread.native_handle(), // pthread线程句柄 + sizeof(cpu_set_t), // CPU集合大小 + &cpuset // CPU集合指针 + ); + + if (result != 0) { + log_module_error(THREAD_TOOL_LOG_MODULE, + "无法将线程亲和性设置为核心{}: {}", + core_id, strerror(result)); + return false; + } + + return true; } /** * @brief 设置线程名称(Linux占位实现) - * - * 当前返回false表示功能未实现。 - * - * ## 计划实现 - * - * 完整实现应该使用pthread_setname_np: - * - * @code - * #include - * #include - * - * bool thread_set_name(boost::thread& thread, const char* name) { - * // Linux限制线程名称最长为15个字符(不含null终止符) - * // 因此需要截断过长的名称 - * char truncated_name[16]; // 15字符 + null终止符 - * strncpy(truncated_name, name, 15); - * truncated_name[15] = '\0'; - * - * // 使用pthread API设置线程名称 - * int result = pthread_setname_np( - * thread.native_handle(), // pthread线程句柄 - * truncated_name // 线程名称(最长15字符) - * ); - * - * if (result != 0) { - * log_module_error(THREAD_TOOL_LOG_MODULE, - * "无法设置线程名称为 {}: {}", - * name, strerror(result)); - * return false; - * } - * return true; - * } - * @endcode - * * ### API说明 * - pthread_setname_np:Linux特定的线程命名API * - 线程名称限制为15个字符(不包括null终止符) @@ -138,8 +94,24 @@ bool thread_set_affinity(boost::thread& thread, int core_id) { * @todo 添加名称长度检查和截断逻辑 */ bool thread_set_name(boost::thread& thread, const char* name) { - // Linux implementation can be added here - // TODO: 使用pthread_setname_np实现 - // 注意:Linux限制线程名称最长为15个字符 - return false; // Placeholder - 占位实现,表示功能未实现 + // Linux限制线程名称最长为15个字符(不含null终止符) + // 因此需要截断过长的名称 + char truncated_name[16]; // 15字符 + null终止符 + strncpy(truncated_name, name, 15); + truncated_name[15] = '\0'; + + // 使用pthread API设置线程名称 + const int result = 
pthread_setname_np( + thread.native_handle(), // pthread线程句柄 + truncated_name // 线程名称(最长15字符) + ); + + if (result != 0) { + log_module_error(THREAD_TOOL_LOG_MODULE, + "无法设置线程名称为 {}: {}", + name, strerror(result)); + return false; + } + + return true; } diff --git a/src/misc/macos/lib_handle.cpp b/src/misc/macos/lib_handle.cpp new file mode 100644 index 0000000..cbfb11e --- /dev/null +++ b/src/misc/macos/lib_handle.cpp @@ -0,0 +1,203 @@ +/** + * @file lib_handle.cpp + * @brief macOS平台动态库加载工具实现 + * + * 实现了lib_handle.h中声明的跨平台动态库加载工具的macOS版本。 + * 使用POSIX标准的dlopen/dlclose/dlsym API来实现动态库的加载、卸载和函数查找功能。 + * 这些API在macOS上通过dyld(动态链接器)实现。 + * + * ## macOS动态库说明 + * - dylib:macOS标准动态库格式(类似Linux的.so) + * - framework:macOS特有的打包格式,包含库、头文件和资源 + * - bundle:可加载的插件格式(.bundle或.plugin) + * + * ## POSIX API说明 + * - dlopen:加载动态库 + * - dlclose:卸载动态库 + * - dlsym:从动态库中获取符号地址 + * - dlerror:获取最后一次错误信息 + * + * @note 使用.cpp扩展名(不是.mm),因为不需要Objective-C功能 + * @note 与Linux实现基本相同,但加载路径和搜索规则有所不同 + */ + +#include "lib_handle.h" + +#include + +/** + * @brief 打开动态库(macOS实现) + * + * 使用POSIX标准的dlopen加载动态库(.dylib、.framework或.bundle)。 + * + * ## 实现细节 + * + * ### macOS动态库类型 + * 1. **dylib** - 标准动态库 + * - 扩展名:.dylib + * - 位置:/usr/lib、/usr/local/lib等 + * - 示例:libMyLib.dylib + * + * 2. **Framework** - macOS特有格式 + * - 位置:/System/Library/Frameworks、/Library/Frameworks + * - 结构:MyFramework.framework/MyFramework + * - 包含:库、头文件、资源 + * + * 3. **Bundle** - 可加载插件 + * - 扩展名:.bundle、.plugin + * - 常用于插件系统 + * + * ### dlopen标志说明 + * - RTLD_LAZY:延迟解析符号(性能更好) + * - 仅在符号首次使用时解析 + * - 如果符号不存在,会在使用时才报错 + * + * - RTLD_NOW:立即解析所有符号 + * - 加载时解析所有符号 + * - 如果有未定义符号,dlopen会失败 + * + * - RTLD_LOCAL:符号仅在本库内可见(默认) + * - RTLD_GLOBAL:符号对后续加载的库可见 + * + * ### macOS搜索路径 + * dlopen按以下顺序搜索: + * 1. @executable_path - 可执行文件所在目录 + * 2. @loader_path - 加载库所在目录 + * 3. @rpath - 运行时搜索路径 + * 4. DYLD_LIBRARY_PATH环境变量(如果设置) + * 5. /usr/local/lib + * 6. 
/usr/lib + * + * ### 系统完整性保护(SIP) + * macOS 10.11+启用了SIP,限制: + * - DYLD_LIBRARY_PATH在受保护进程中被忽略 + * - 无法修改系统库路径 + * - 某些目录需要特殊权限 + * + * ### 错误处理 + * dlopen失败时返回nullptr,使用dlerror()获取错误信息: + * - "image not found" - 库文件不存在 + * - "no suitable image found" - 架构不匹配 + * - "symbol not found" - 缺少符号(RTLD_NOW模式) + * - "Library not loaded" - 缺少依赖库 + * + * @param lib_path 动态库路径(支持相对路径、绝对路径、@rpath等) + * @return bool true表示成功,false表示失败 + * + * @note 会先调用close()关闭已打开的库 + * @note 使用RTLD_LAZY以获得更好的性能 + * @note 失败时可以调用dlerror()获取详细错误信息 + */ +auto lib_handle::open(const std::filesystem::path& lib_path) -> bool { + close(); + + // 使用RTLD_LAZY延迟加载符号 + // 在macOS上,dlopen可以加载.dylib、.framework和.bundle + handle_ = dlopen(lib_path.c_str(), RTLD_LAZY); + return handle_ != nullptr; +} + +/** + * @brief 关闭动态库(macOS实现) + * + * 使用POSIX标准的dlclose卸载动态库。 + * + * ## 实现细节 + * + * ### 引用计数 + * - dlclose递减库的引用计数 + * - 当引用计数降为0时,库才会真正卸载 + * - 如果库被多次dlopen,需要相同次数的dlclose + * + * ### 析构函数 + * 库卸载时会调用: + * - C++全局对象的析构函数 + * - __attribute__((destructor))标记的函数 + * - atexit()注册的清理函数 + * + * ### macOS特性 + * - 某些系统库可能无法卸载(返回错误但不影响程序) + * - Framework的卸载也会卸载其资源和依赖 + * - 卸载时dyld会处理依赖关系 + * + * ### 线程安全 + * - dlclose是线程安全的 + * - 但需要确保没有线程正在使用库中的代码 + * - 正在执行的函数可能导致崩溃 + * + * @note 调用后handle_会被设置为nullptr + * @note 重复调用是安全的(会检查handle_是否为空) + * @note 卸载失败时dlerror()会返回错误信息 + */ +void lib_handle::close() { + if (handle_) { + dlclose(handle_); + handle_ = nullptr; + } +} + +/** + * @brief 获取函数地址(macOS实现) + * + * 使用POSIX标准的dlsym从动态库中获取符号地址。 + * + * ## 实现细节 + * + * ### 符号查找 + * dlsym可以查找: + * - C函数:直接使用函数名 + * - C++函数:需要使用extern "C"避免名称修饰 + * - 全局变量:可以获取变量地址 + * - 弱符号:如果存在返回地址,否则返回nullptr + * + * ### 名称修饰(Name Mangling) + * C++编译器会修饰函数名以支持重载: + * @code + * // C++函数 + * int add(int a, int b); // 可能被修饰为 __Z3addii + * + * // 避免修饰 + * extern "C" int add(int a, int b); // 保持为 add + * @endcode + * + * ### macOS符号约定 + * - 前导下划线:C符号通常有前导下划线(但dlsym会自动处理) + * - 隐藏符号:使用__attribute__((visibility("hidden")))的符号无法查找 + * - 
弱符号:使用__attribute__((weak))的符号可以被覆盖 + * + * ### 错误处理 + * dlsym失败返回nullptr,常见原因: + * - 符号不存在 + * - 符号被标记为隐藏 + * - 名称修饰不匹配 + * - 库未正确导出符号 + * + * ### 使用示例 + * @code + * // 假设库中有:extern "C" int calculate(int x); + * lib_handle lib; + * lib.open("libmath.dylib"); + * + * // 获取函数指针 + * auto func = lib.get_function("calculate"); + * if (func) { + * int result = func(42); + * } + * @endcode + * + * @param func_name 符号名称(C风格字符串) + * @return void* 符号地址指针,失败时返回nullptr + * + * @note 如果handle_为空,直接返回nullptr + * @note 返回的指针需要转换为正确的类型才能使用 + * @note 可以使用dlerror()获取失败原因 + */ +auto lib_handle::get_raw_function(const std::string& func_name) -> void* { + if (!handle_) { + return nullptr; + } + + // dlsym返回符号地址(函数或变量) + // 在macOS上,dlsym会自动处理前导下划线 + return dlsym(handle_, func_name.c_str()); +} \ No newline at end of file diff --git a/src/misc/windows/lib_handle.cpp b/src/misc/windows/lib_handle.cpp new file mode 100644 index 0000000..7d44589 --- /dev/null +++ b/src/misc/windows/lib_handle.cpp @@ -0,0 +1,130 @@ +/** + * @file lib_handle.cpp + * @brief Windows平台动态库加载工具实现 + * + * 实现了lib_handle.h中声明的跨平台动态库加载工具的Windows版本。 + * 使用Windows API(LoadLibrary、FreeLibrary、GetProcAddress)来实现 + * 动态库的加载、卸载和函数查找功能。 + * + * ## Windows API说明 + * - LoadLibrary:加载动态链接库(DLL) + * - FreeLibrary:卸载动态链接库 + * - GetProcAddress:从DLL中获取函数地址 + * + * @note 仅在Windows平台编译 + */ + +#include "lib_handle.h" + +#include + +/** + * @brief 打开动态库(Windows实现) + * + * 使用Windows API LoadLibrary加载指定路径的DLL文件。 + * + * ## 实现细节 + * + * ### LoadLibrary行为 + * - 如果DLL已经加载,会增加其引用计数 + * - 搜索顺序: + * 1. 应用程序目录 + * 2. 系统目录(System32) + * 3. Windows目录 + * 4. 当前目录 + * 5. 
PATH环境变量中的目录 + * + * ### 错误处理 + * LoadLibrary失败时返回NULL,可能的原因: + * - 文件不存在 + * - 不是有效的DLL文件 + * - 缺少依赖的DLL + * - 架构不匹配(32位/64位) + * - 权限不足 + * + * @param lib_path DLL文件的路径 + * @return bool true表示成功,false表示失败 + * + * @note 会先调用close()关闭已打开的库 + * @note 失败时可以使用GetLastError()获取详细错误代码 + */ +auto lib_handle::open(const std::filesystem::path& lib_path) -> bool { + close(); + + // 使用LoadLibrary加载DLL + // lib_path.c_str()返回const char*,在Windows上会自动转换为所需的类型 + handle_ = LoadLibraryA(lib_path.string().c_str()); + return handle_ != nullptr; +} + +/** + * @brief 关闭动态库(Windows实现) + * + * 使用Windows API FreeLibrary卸载DLL。 + * + * ## 实现细节 + * + * ### 引用计数 + * - FreeLibrary递减DLL的引用计数 + * - 当引用计数降为0时,DLL才会真正卸载 + * - 如果DLL被多次LoadLibrary,需要相同次数的FreeLibrary + * + * ### 线程安全 + * - DLL的DllMain函数会在卸载时被调用(DLL_PROCESS_DETACH) + * - 需要确保没有其他线程正在使用DLL中的代码或数据 + * + * @note 调用后handle_会被设置为nullptr + * @note 重复调用是安全的(会检查handle_是否为空) + */ +void lib_handle::close() { + if (handle_) { + FreeLibrary(static_cast(handle_)); + handle_ = nullptr; + } +} + +/** + * @brief 获取函数地址(Windows实现) + * + * 使用Windows API GetProcAddress从DLL中获取导出函数的地址。 + * + * ## 实现细节 + * + * ### 函数查找 + * GetProcAddress通过函数名查找: + * - C函数:直接使用函数名 + * - C++函数:需要使用extern "C"避免名称修饰(name mangling) + * - 导出序号:也可以使用序号而不是名称(不推荐) + * + * ### 名称修饰 + * C++编译器会对函数名进行修饰,导致查找失败。解决方法: + * @code + * // 在DLL中声明函数时使用 + * extern "C" __declspec(dllexport) int my_function(int x); + * @endcode + * + * ### 调用约定 + * 需要确保函数的调用约定匹配: + * - __cdecl:C默认调用约定 + * - __stdcall:Windows API调用约定 + * - __fastcall:快速调用约定 + * + * @param func_name 函数名称(C风格字符串) + * @return void* 函数地址指针,失败时返回nullptr + * + * @note 如果handle_为空,直接返回nullptr + * @note 返回的指针需要转换为正确的函数指针类型才能调用 + * @note 使用FARPROC类型表示函数指针,然后转换为void* + */ +auto lib_handle::get_raw_function(const std::string& func_name) -> void* { + if (!handle_) { + return nullptr; + } + + // GetProcAddress返回FARPROC类型(函数指针) + // FARPROC是Windows定义的通用函数指针类型 + // 将其转换为void*以保持跨平台的接口一致性 + return reinterpret_cast( + 
GetProcAddress(static_cast(handle_), func_name.c_str()) + ); +} \ No newline at end of file diff --git a/src/network/CMakeLists.txt b/src/network/CMakeLists.txt index 6b0f192..7178da7 100644 --- a/src/network/CMakeLists.txt +++ b/src/network/CMakeLists.txt @@ -1,7 +1,7 @@ project(alicho_network) find_package(Boost COMPONENTS interprocess date_time thread CONFIG REQUIRED) -find_package(zeromq REQUIRED) +find_package(cppzmq CONFIG REQUIRED) find_package(cppzmq REQUIRED) find_package(yalantinglibs CONFIG REQUIRED) diff --git a/src/process_manager/process_monitor.cpp b/src/process_manager/process_monitor.cpp index e230d27..7b231b3 100644 --- a/src/process_manager/process_monitor.cpp +++ b/src/process_manager/process_monitor.cpp @@ -88,7 +88,7 @@ namespace alicho { process_error process_monitor::update_status() { try { - auto previous_state = monitored_process_.state; + // auto previous_state = monitored_process_.state; // 检查进程状态 bool is_running = check_process_running(); diff --git a/src/simd/CMakeLists.txt b/src/simd/CMakeLists.txt index cebfcf7..f0672fd 100644 --- a/src/simd/CMakeLists.txt +++ b/src/simd/CMakeLists.txt @@ -1,4 +1,7 @@ -project(alicho_simd) -simple_library(STATIC) -target_link_libraries(${PROJECT_NAME} PUBLIC alicho_misc) +add_subdirectory(simd_interface) +add_subdirectory(simd_scaler) +add_subdirectory(simd_sse) +add_subdirectory(simd_avx) +add_subdirectory(simd_avx512) +add_subdirectory(misc) diff --git a/src/simd/audio_processing/simd_audio_processing.cpp b/src/simd/audio_processing/simd_audio_processing.cpp deleted file mode 100644 index 9c2d19a..0000000 --- a/src/simd/audio_processing/simd_audio_processing.cpp +++ /dev/null @@ -1,158 +0,0 @@ -/** - * @file simd_audio_processing.cpp - * @brief SIMD音频处理函数注册模块实现 - * - * 本文件负责将所有音频处理函数(标量和SIMD优化版本)注册到SIMD函数调度器中。 - * 注册过程采用分层策略: - * 1. 根据平台(x86或ARM)选择合适的SIMD实现 - * 2. 为每个SIMD指令集版本注册对应的函数实现 - * 3. 
运行时根据CPU特性自动选择最优实现 - * - * 注册流程: - * - 标量实现:所有平台通用的基准实现 - * - x86平台:SSE/SSE3/SSE4/AVX/AVX2/AVX512优化实现 - * - ARM平台:NEON优化实现 - */ - -#include "simd_audio_processing.h" - -#include "scalar_audio_processing_func.h" -#include "simd_func_dispatcher.h" -#include "x86_simd_audio_processing_func.h" -#include "arm_simd_audio_processing_func.h" - -/** - * @brief x86平台自动注册宏 - 注册所有x86 SIMD版本 - * - * 此宏为指定函数注册多个SIMD实现版本: - * - SCALAR: 标量实现(所有平台通用的基准版本) - * - SSE/SSE3/SSE4: 使用相同的SSE实现(128位向量,处理4个float) - * - AVX/AVX2: 使用相同的AVX实现(256位向量,处理8个float) - * - AVX512: 最新的AVX-512实现(512位向量,处理16个float) - * - * 注册顺序说明: - * 1. 先注册标量版本作为后备实现 - * 2. 按指令集从旧到新注册SIMD版本 - * 3. 运行时调度器会根据CPU特性选择最优版本 - * - * 注意:SSE/SSE3/SSE4共用同一实现是因为这些指令集间差异不影响音频处理性能 - */ -#if ALICHO_PLATFORM_X86 -#define AUTO_REGISTER_SIMD_FUNCTION(func_name)\ - REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::SCALAR, scalar_audio_processing_func::func_name); \ - REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::SSE, x86_simd_audio_processing_func::func_name##_sse); \ - REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::SSE3, x86_simd_audio_processing_func::func_name##_sse); \ - REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::SSE4, x86_simd_audio_processing_func::func_name##_sse); \ - REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::AVX, x86_simd_audio_processing_func::func_name##_avx); \ - REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::AVX2, x86_simd_audio_processing_func::func_name##_avx); \ - REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::AVX512, x86_simd_audio_processing_func::func_name##_avx512); - -/** - * @brief ARM平台自动注册宏 - 注册标量和NEON版本 - * - * ARM平台注册策略: - * - SCALAR: 标量实现(所有平台通用) - * - NEON: ARM的SIMD指令集实现(128位向量,处理4个float) - * - * 注册顺序说明: - * 1. 先注册标量版本作为后备实现 - * 2. 注册NEON优化版本(大多数现代ARM处理器都支持) - * 3. 
运行时根据CPU是否支持NEON自动选择 - */ -#elif ALICHO_PLATFORM_ARM -#define AUTO_REGISTER_SIMD_FUNCTION(func_name) \ - REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::SCALAR, scalar_audio_processing_func::func_name); \ - REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::NEON, neon_simd_audio_processing_func::func_name##_neon); -#endif - -/** - * @brief 强制使用标量实现的注册宏 - * - * 某些函数可能由于以下原因只使用标量实现: - * 1. 算法特性不适合SIMD优化(如分支过多) - * 2. SIMD实现收益不明显甚至可能降低性能 - * 3. 实现复杂度过高,维护成本超过性能收益 - * - * 此宏将标量实现注册到所有SIMD版本槽位,确保: - * - 无论CPU支持何种指令集,都使用相同的标量实现 - * - 避免因缺少SIMD实现导致的运行时错误 - * - 保持API一致性,调用方无需关心实现细节 - * - * 当前使用此宏的函数: - * - apply_gain: 虽然可以SIMD优化,但此处暂时使用标量版本 - */ -#define FORCE_SCALAR_SIMD_FUNCTION(func_name) \ - REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::SCALAR, scalar_audio_processing_func::func_name); \ - REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::SSE, scalar_audio_processing_func::func_name); \ - REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::SSE3, scalar_audio_processing_func::func_name); \ - REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::SSE4, scalar_audio_processing_func::func_name); \ - REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::AVX, scalar_audio_processing_func::func_name); \ - REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::AVX2, scalar_audio_processing_func::func_name); \ - REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::AVX512, scalar_audio_processing_func::func_name); \ - REGISTER_SIMD_FUNCTION(#func_name, simd_func_version::NEON, scalar_audio_processing_func::func_name); - -/** - * @brief 注册所有音频处理函数到SIMD调度器 - * - * 此函数在程序初始化时调用,负责: - * 1. 将所有音频处理函数的各个SIMD版本注册到调度器 - * 2. 建立函数名到实现的映射关系 - * 3. 
为运行时动态分发做准备 - * - * 注册的函数列表: - * - mix_audio: 音频混合(两路音频相加) - * - apply_gain: 音量增益调节(暂时使用标量版本) - * - calculate_rms: RMS电平计算(均方根值) - * - calculate_peak: 峰值电平检测 - * - normalize_audio: 音频归一化处理 - * - stereo_to_mono: 立体声转单声道 - * - limit_audio: 音频限幅器(动态范围压缩) - * - fade_audio: 淡入淡出效果 - * - simple_eq: 简单三段均衡器(低频/中频/高频) - * - * 注册顺序考虑: - * - 按功能类型分组:基础操作 -> 分析 -> 效果处理 - * - 简单函数在前,复杂函数在后 - * - 便于理解和维护 - * - * 错误处理: - * - REGISTER_SIMD_FUNCTION宏内部会处理重复注册 - * - 如果函数不存在会在编译时报错(类型安全) - */ -void audio_processing_registry::register_all_functions() { - AUTO_REGISTER_SIMD_FUNCTION(mix_audio); // 音频混合 - FORCE_SCALAR_SIMD_FUNCTION(apply_gain); // 增益调节(标量版本) - AUTO_REGISTER_SIMD_FUNCTION(calculate_rms); // RMS计算 - AUTO_REGISTER_SIMD_FUNCTION(calculate_peak); // 峰值检测 - AUTO_REGISTER_SIMD_FUNCTION(normalize_audio); // 归一化 - AUTO_REGISTER_SIMD_FUNCTION(stereo_to_mono); // 立体声转单声道 - AUTO_REGISTER_SIMD_FUNCTION(limit_audio); // 限幅器 - AUTO_REGISTER_SIMD_FUNCTION(fade_audio); // 淡入淡出 - AUTO_REGISTER_SIMD_FUNCTION(simple_eq); // 简单均衡器 -} - -/** - * @brief 打印所有已注册函数的状态信息 - * - * 此函数用于调试和诊断,输出内容包括: - * 1. 已注册的函数名称列表 - * 2. 每个函数可用的SIMD版本 - * 3. 当前CPU支持的指令集 - * 4. 运行时将使用的具体实现版本 - * - * 使用场景: - * - 开发调试:验证函数是否正确注册 - * - 性能分析:确认使用了最优的SIMD版本 - * - 问题诊断:检查CPU特性检测是否正常 - * - 用户支持:提供系统配置信息 - * - * 输出示例: - * "Function: mix_audio - * - SCALAR: available - * - SSE: available - * - AVX: available (selected) - * - AVX512: not available (CPU not supported)" - */ -void audio_processing_registry::print_available_functions() { - simd_func_dispatcher::instance().print_registry_status(); -} diff --git a/src/simd/audio_processing/simd_audio_processing.h b/src/simd/audio_processing/simd_audio_processing.h deleted file mode 100644 index 0c03f31..0000000 --- a/src/simd/audio_processing/simd_audio_processing.h +++ /dev/null @@ -1,146 +0,0 @@ -/** - * @file simd_audio_processing.h - * @brief SIMD音频处理函数注册器 - 音频处理功能的统一注册和管理入口 - * - * 本文件定义了音频处理函数的注册器类,负责将所有音频处理函数 - * (包括标量版本和各种SIMD优化版本)注册到函数分发器中。 - * - * 核心职责: - * 1. 
**函数注册**:将所有音频处理函数注册到SIMD函数分发器 - * 2. **平台适配**:根据CPU特性选择合适的SIMD实现 - * 3. **性能优化**:确保运行时使用最优的函数实现 - * 4. **调试支持**:提供可用函数列表打印功能 - * - * 与函数分发器的关系: - * ``` - * simd_func_dispatcher (分发器) - * ↑ - * | 注册 - * | - * audio_processing_registry (本文件) - * | - * | 包含 - * ↓ - * 标量版本 + SSE + AVX + AVX512 + NEON (实现) - * ``` - * - * 支持的音频处理功能: - * - 音频混合 (mix_audio) - * - 增益控制 (apply_gain) - * - RMS计算 (calculate_rms) - * - 峰值检测 (calculate_peak) - * - 音频归一化 (normalize_audio) - * - 立体声转单声道 (stereo_to_mono) - * - 音频限幅 (limit_audio) - * - 淡入淡出 (fade_audio) - * - 三段均衡器 (simple_eq) - * - * 使用方式: - * ```cpp - * // 在程序启动时调用一次 - * audio_processing_registry::register_all_functions(); - * - * // 调试时查看可用函数 - * audio_processing_registry::print_available_functions(); - * - * // 之后通过函数分发器使用 - * auto func = simd_func_dispatcher::get_function<...>("mix_audio"); - * ``` - * - * @note 此类只包含静态方法,不需要实例化 - * @see simd_func_dispatcher.h 函数分发器的定义 - * @see scalar_audio_processing_func.h 标量实现 - * @see x86_simd_audio_processing_func.h x86 SIMD实现 - * @see arm_simd_audio_processing_func.h ARM NEON实现 - */ - -#pragma once - -/** - * @class audio_processing_registry - * @brief 音频处理函数注册器 - * - * 负责将所有音频处理函数注册到SIMD函数分发器中。 - * 注册过程会根据当前CPU的特性,自动选择最优的实现版本。 - * - * 注册顺序和优先级: - * 1. 首先注册标量版本(保底实现,所有平台可用) - * 2. 
然后注册SIMD版本(如果CPU支持): - * - x86平台:SSE -> AVX -> AVX512(按性能递增) - * - ARM平台:NEON - * - * 函数分发器会根据注册顺序,优先使用后注册的高性能版本。 - * - * 线程安全性: - * - register_all_functions() 应该在程序启动时调用一次 - * - 不是线程安全的,不应该并发调用 - * - 注册完成后,使用函数是线程安全的 - */ -class audio_processing_registry { -public: - /** - * @brief 注册所有音频处理函数 - * - * 将所有支持的音频处理函数注册到SIMD函数分发器中。 - * 此函数会检测当前CPU特性,并注册所有兼容的实现版本。 - * - * 注册的函数包括: - * - 音频信号处理:mix_audio, apply_gain - * - 音频分析:calculate_rms, calculate_peak - * - 音频处理效果:normalize_audio, stereo_to_mono, limit_audio, fade_audio, simple_eq - * - * 每个函数都会注册多个版本(如果CPU支持): - * - 标量版本(必定存在) - * - SSE版本(x86平台,如果支持) - * - AVX版本(x86平台,如果支持) - * - AVX512版本(x86平台,如果支持) - * - NEON版本(ARM平台,如果支持) - * - * @note 应该在程序启动早期调用,只需调用一次 - * @note 不是线程安全的,不应并发调用 - * @warning 重复调用可能导致重复注册 - * - * 使用示例: - * ```cpp - * int main() { - * // 初始化阶段 - * audio_processing_registry::register_all_functions(); - * - * // 后续使用 - * auto mix_func = simd_func_dispatcher::get_function<...>("mix_audio"); - * mix_func(src1, src2, dst, samples); - * return 0; - * } - * ``` - */ - static void register_all_functions(); - - /** - * @brief 打印所有可用的音频处理函数 - * - * 输出所有已注册的音频处理函数及其实现版本。 - * 用于调试和验证函数注册是否成功。 - * - * 输出格式示例: - * ``` - * Available audio processing functions: - * mix_audio: - * - scalar (baseline) - * - sse (4x SIMD) - * - avx (8x SIMD) - * apply_gain: - * - scalar (baseline) - * - neon (ARM SIMD) - * ... - * ``` - * - * 应用场景: - * - 验证SIMD函数是否正确注册 - * - 检查当前平台支持哪些优化版本 - * - 性能调试和分析 - * - * @note 此函数仅用于调试,不影响程序功能 - * @note 输出会打印到标准输出或日志系统 - */ - static void print_available_functions(); -}; diff --git a/src/simd/audio_processing/x86_simd_audio_processing_func.cpp b/src/simd/audio_processing/x86_simd_audio_processing_func.cpp deleted file mode 100644 index 579cb84..0000000 --- a/src/simd/audio_processing/x86_simd_audio_processing_func.cpp +++ /dev/null @@ -1,2460 +0,0 @@ -/** - * @file x86_simd_audio_processing_func.cpp - * @brief x86 SIMD音频处理函数实现(SSE/AVX/AVX512) - * - * 本文件提供x86架构的SIMD优化音频处理函数,包含三个指令集版本: - * - * 1. 
**SSE版本**(128位向量,处理4个float) - * - 指令集:SSE/SSE2/SSE3/SSE4共用实现 - * - 向量寄存器:XMM(128位,16字节对齐) - * - 适用于2004年后的所有x86处理器 - * - 性能提升:相比标量版本约4倍 - * - * 2. **AVX版本**(256位向量,处理8个float) - * - 指令集:AVX/AVX2共用实现 - * - 向量寄存器:YMM(256位,32字节对齐) - * - 适用于2011年后的Intel/AMD处理器 - * - 性能提升:相比标量版本约8倍 - * - * 3. **AVX-512版本**(512位向量,处理16个float) - * - 指令集:AVX-512F(基础) - * - 向量寄存器:ZMM(512位,64字节对齐) - * - 适用于Intel Skylake-X及更新处理器 - * - 性能提升:相比标量版本约16倍 - * - * **SIMD优化策略**: - * - * 1. **循环展开(Loop Unrolling)** - * - 主循环采用4路展,提高指令级并行性(ILP) - * - 减少循环控制开销 - * - 充分利用CPU的多发射和乱序执行能力 - * - * 2. **数据对齐** - * - 所有数据指针必须对齐到指令集要求的边界 - * - 使用对齐加载/存储指令(_mm_load_ps/_mm_store_ps) - * - 对齐访问比非对齐访问快约20-50% - * - * 3. **剩余元素处理** - * - 主循环处理对齐的向量块 - * - 次级循环处理剩余的完整向量 - * - 标量循环处理最后的零散样本 - * - * 4. **寄存器压力管理** - * - 合理使用向量寄存器,避免寄存器溢出 - * - 循环展开数量平衡性能和寄存器使用 - * - * 5. **SIMD Intrinsics说明** - * - _mm_load_ps: 加载4个对齐的float(SSE) - * - _mm256_load_ps: 加载8个对齐的float(AVX) - * - _mm512_load_ps: 加载16个对齐的float(AVX-512) - * - _mm_add_ps: 4个float并行加法(SSE) - * - _mm_mul_ps: 4个float并行乘法(SSE) - * - _mm_store_ps: 存储4个对齐的float(SSE) - */ - -#include "x86_simd_audio_processing_func.h" - -#include -#include // SIMD内联函数所需的头文件 - -#include "aligned_allocator.h" - -#if ALICHO_PLATFORM_X86 -namespace x86_simd_audio_processing_func { - /** - * @brief 音频混合函数(SSE优化版本) - * - * 使用SSE指令集实现两路音频的并行相加。 - * - * **向量化策略**: - * - 每个SSE向量处理4个float样本 - * - 4路循环展开,一次处理16个样本 - * - 充分利用CPU的指令级并行 - * - * **关键SIMD指令**: - * - _mm_load_ps: 对齐加载4个float到128位XMM寄存器 - * - _mm_add_ps: 并行执行4个float加法(1个CPU周期) - * - _mm_store_ps: 对齐存储4个float到内存 - * - * **性能分析**: - * - 标量版本:16个样本需要16次加法 - * - SSE版本:16个样本需要4次向量加法 - * - 理论加速比:4倍(实际约3.5-3.8倍,考虑加载/存储开销) - * - * **数据对齐**: - * - 所有指针必须16字节对齐(ALIGNMENT_SSE) - * - 对齐加载比非对齐加载快约30% - * - ASSERT_ALIGNED宏在Debug模式检查对齐 - * - * @param src1 第一路输入(16字节对齐) - * @param src2 第二路输入(16字节对齐) - * @param dst 输出缓冲区(16字节对齐) - * @param num_samples 样本数量 - */ - void mix_audio_sse(const float* src1, const float* src2, float* dst, size_t num_samples) { 
- // 对齐检查:确保指针满足SSE对齐要求(16字节) - ASSERT_ALIGNED(src1, ALIGNMENT_SSE); - ASSERT_ALIGNED(src2, ALIGNMENT_SSE); - ASSERT_ALIGNED(dst, ALIGNMENT_SSE); - - constexpr size_t simd_width = 4; // SSE每次处理4个float - constexpr size_t unroll_factor = 4; // 4路循环展开提高指令级并行性 - size_t i = 0; - - // **主循环**:向量化处理(4路循环展开) - // 每次迭代处理16个样本(4个向量 x 4个float) - for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) { - // 加载第一组4个向量(16个float) - auto a0 = _mm_load_ps(&src1[i]); // 加载src1[i..i+3] - auto a1 = _mm_load_ps(&src1[i + 4]); // 加载src1[i+4..i+7] - auto a2 = _mm_load_ps(&src1[i + 8]); // 加载src1[i+8..i+11] - auto a3 = _mm_load_ps(&src1[i + 12]); // 加载src1[i+12..i+15] - - // 加载第二组4个向量 - auto b0 = _mm_load_ps(&src2[i]); - auto b1 = _mm_load_ps(&src2[i + 4]); - auto b2 = _mm_load_ps(&src2[i + 8]); - auto b3 = _mm_load_ps(&src2[i + 12]); - - // 并行计算:每条指令同时执行4个加法 - auto result0 = _mm_add_ps(a0, b0); // result0[j] = a0[j] + b0[j], j=0..3 - auto result1 = _mm_add_ps(a1, b1); - auto result2 = _mm_add_ps(a2, b2); - auto result3 = _mm_add_ps(a3, b3); - - // 存储结果到内存 - _mm_store_ps(&dst[i], result0); - _mm_store_ps(&dst[i + 4], result1); - _mm_store_ps(&dst[i + 8], result2); - _mm_store_ps(&dst[i + 12], result3); - } - - // **次级循环**:处理剩余的完整向量(1-3个向量,4-12个样本) - for (; i + simd_width <= num_samples; i += simd_width) { - auto a = _mm_load_ps(&src1[i]); - auto b = _mm_load_ps(&src2[i]); - auto result = _mm_add_ps(a, b); - _mm_store_ps(&dst[i], result); - } - - // **收尾循环**:标量处理剩余样本(0-3个样本) - // 无法凑成完整向量的样本用标量方式处理 - for (; i < num_samples; ++i) { - dst[i] = src1[i] + src2[i]; - } - } - - /** - * @brief 音频混合函数(AVX优化版本) - * - * 使用AVX指令集实现音频混合,向量宽度是SSE的2倍。 - * - * **AVX vs SSE对比**: - * - 向量宽度:256位 vs 128位(2倍) - * - 单次处理:8个float vs 4个float - * - 寄存器:YMM (256位) vs XMM (128位) - * - 对齐要求:32字节 vs 16字节 - * - * **性能提升**: - * - 相比SSE版本约2倍加速 - * - 相比标量版本约8倍加速 - * - 主循环每次处理32个样本(4向量 x 8float) - * - * **关键AVX指令**: - * - _mm256_load_ps: 加载8个对齐float到YMM寄存器 - * - _mm256_add_ps: 
并行执行8个float加法 - * - _mm256_store_ps: 存储8个float到内存 - * - * @param src1 第一路输入(32字节对齐) - * @param src2 第二路输入(32字节对齐) - * @param dst 输出缓冲区(32字节对齐) - * @param num_samples 样本数量 - */ - void mix_audio_avx(const float* src1, const float* src2, float* dst, size_t num_samples) { - ASSERT_ALIGNED(src1, ALIGNMENT_AVX); - ASSERT_ALIGNED(src2, ALIGNMENT_AVX); - ASSERT_ALIGNED(dst, ALIGNMENT_AVX); - - constexpr size_t simd_width = 8; // AVX每次处理8个float - constexpr size_t unroll_factor = 4; // 4路循环展开提高指令级并行性 - size_t i = 0; - - // **主循环**:4路展开,每次处理32个样本 - for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) { - // 加载4个向量 - auto a0 = _mm256_load_ps(&src1[i]); - auto a1 = _mm256_load_ps(&src1[i + 8]); - auto a2 = _mm256_load_ps(&src1[i + 16]); - auto a3 = _mm256_load_ps(&src1[i + 24]); - - auto b0 = _mm256_load_ps(&src2[i]); - auto b1 = _mm256_load_ps(&src2[i + 8]); - auto b2 = _mm256_load_ps(&src2[i + 16]); - auto b3 = _mm256_load_ps(&src2[i + 24]); - - // 并行计算 - auto result0 = _mm256_add_ps(a0, b0); - auto result1 = _mm256_add_ps(a1, b1); - auto result2 = _mm256_add_ps(a2, b2); - auto result3 = _mm256_add_ps(a3, b3); - - // 存储结果 - _mm256_store_ps(&dst[i], result0); - _mm256_store_ps(&dst[i + 8], result1); - _mm256_store_ps(&dst[i + 16], result2); - _mm256_store_ps(&dst[i + 24], result3); - } - - // 处理剩余的向量(单次处理) - for (; i + simd_width <= num_samples; i += simd_width) { - auto a = _mm256_load_ps(&src1[i]); - auto b = _mm256_load_ps(&src2[i]); - auto result = _mm256_add_ps(a, b); - _mm256_store_ps(&dst[i], result); - } - - // 处理剩余的标量样本 - for (; i < num_samples; ++i) { - dst[i] = src1[i] + src2[i]; - } - } - - /** - * @brief 音频混合函数(AVX-512优化版本) - * - * 使用AVX-512指令集,提供最高的并行度。 - * - * **AVX-512特性**: - * - 向量宽度:512位(AVX的2倍,SSE的4倍) - * - 单次处理:16个float - * - 寄存器:ZMM (512位,共32个) - * - 对齐要求:64字节 - * - 掩码寄存器:支持条件执行(本函数未使用) - * - * **性能特性**: - * - 相比AVX版本约2倍加速 - * - 相比标量版本约16倍加速 - * - 主循环每次处理64个样本 - * - 适合大批量音频处理 - * - * **能耗考虑**: - * - AVX-512可能导致CPU降频(某些处理器) 
- * - 建议在持续高负载场景使用 - * - 短时burst处理可能不如AVX - * - * **关键AVX-512指令**: - * - _mm512_load_ps: 加载16个对齐float - * - _mm512_add_ps: 并行16个float加法 - * - _mm512_store_ps: 存储16个float - * - * @param src1 第一路输入(64字节对齐) - * @param src2 第二路输入(64字节对齐) - * @param dst 输出缓冲区(64字节对齐) - * @param num_samples 样本数量 - */ - void mix_audio_avx512(const float* src1, const float* src2, float* dst, size_t num_samples) { - ASSERT_ALIGNED(src1, ALIGNMENT_AVX512); - ASSERT_ALIGNED(src2, ALIGNMENT_AVX512); - ASSERT_ALIGNED(dst, ALIGNMENT_AVX512); - - constexpr size_t simd_width = 16; // AVX-512每次处理16个float - constexpr size_t unroll_factor = 4; // 4路循环展开提高指令级并行性 - size_t i = 0; - - // **主循环**:4路展开,每次处理64个样本 - for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) { - // 加载4个向量 - const auto a0 = _mm512_load_ps(&src1[i]); - const auto a1 = _mm512_load_ps(&src1[i + 16]); - const auto a2 = _mm512_load_ps(&src1[i + 32]); - const auto a3 = _mm512_load_ps(&src1[i + 48]); - - const auto b0 = _mm512_load_ps(&src2[i]); - const auto b1 = _mm512_load_ps(&src2[i + 16]); - const auto b2 = _mm512_load_ps(&src2[i + 32]); - const auto b3 = _mm512_load_ps(&src2[i + 48]); - - // 并行计算 - const auto result0 = _mm512_add_ps(a0, b0); - const auto result1 = _mm512_add_ps(a1, b1); - const auto result2 = _mm512_add_ps(a2, b2); - const auto result3 = _mm512_add_ps(a3, b3); - - // 存储结果 - _mm512_store_ps(&dst[i], result0); - _mm512_store_ps(&dst[i + 16], result1); - _mm512_store_ps(&dst[i + 32], result2); - _mm512_store_ps(&dst[i + 48], result3); - } - - // 处理剩余的向量(单次处理) - for (; i + simd_width <= num_samples; i += simd_width) { - auto a = _mm512_load_ps(&src1[i]); - auto b = _mm512_load_ps(&src2[i]); - auto result = _mm512_add_ps(a, b); - _mm512_store_ps(&dst[i], result); - } - - // 处理剩余的标量样本 - for (; i < num_samples; ++i) { - dst[i] = src1[i] + src2[i]; - } - } - - void apply_gain_sse(const float* src, float* dst, float gain, size_t num_samples) { - ASSERT_ALIGNED(src, ALIGNMENT_SSE); - 
ASSERT_ALIGNED(dst, ALIGNMENT_SSE); - - constexpr size_t simd_width = 4; // SSE每次处理4个float - constexpr size_t unroll_factor = 4; // 4路循环展开提高指令级并行性 - size_t i = 0; - - auto gain_vec = _mm_set1_ps(gain); - - // 向量化处理(4路循环展开) - for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) { - // 加载4个向量 - auto a0 = _mm_load_ps(&src[i]); - auto a1 = _mm_load_ps(&src[i + 4]); - auto a2 = _mm_load_ps(&src[i + 8]); - auto a3 = _mm_load_ps(&src[i + 12]); - - // 并行计算增益应用 - auto result0 = _mm_mul_ps(a0, gain_vec); - auto result1 = _mm_mul_ps(a1, gain_vec); - auto result2 = _mm_mul_ps(a2, gain_vec); - auto result3 = _mm_mul_ps(a3, gain_vec); - - // 存储结果 - _mm_store_ps(&dst[i], result0); - _mm_store_ps(&dst[i + 4], result1); - _mm_store_ps(&dst[i + 8], result2); - _mm_store_ps(&dst[i + 12], result3); - } - - // 处理剩余的向量(单次处理) - for (; i + simd_width <= num_samples; i += simd_width) { - auto a = _mm_load_ps(&src[i]); - auto result = _mm_mul_ps(a, gain_vec); - _mm_store_ps(&dst[i], result); - } - - // 处理剩余的标量样本 - for (; i < num_samples; ++i) { - dst[i] = src[i] * gain; - } - } - - void apply_gain_avx(const float* src, float* dst, float gain, size_t num_samples) { - ASSERT_ALIGNED(src, ALIGNMENT_AVX); - ASSERT_ALIGNED(dst, ALIGNMENT_AVX); - - constexpr size_t simd_width = 8; // AVX每次处理8个float - constexpr size_t unroll_factor = 4; // 4路循环展开提高指令级并行性 - size_t i = 0; - - auto gain_vec = _mm256_set1_ps(gain); - - // 向量化处理(4路循环展开) - for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) { - // 加载4个向量 - auto a0 = _mm256_load_ps(&src[i]); - auto a1 = _mm256_load_ps(&src[i + 8]); - auto a2 = _mm256_load_ps(&src[i + 16]); - auto a3 = _mm256_load_ps(&src[i + 24]); - - // 并行计算增益应用 - auto result0 = _mm256_mul_ps(a0, gain_vec); - auto result1 = _mm256_mul_ps(a1, gain_vec); - auto result2 = _mm256_mul_ps(a2, gain_vec); - auto result3 = _mm256_mul_ps(a3, gain_vec); - - // 存储结果 - _mm256_store_ps(&dst[i], result0); - _mm256_store_ps(&dst[i + 
8], result1); - _mm256_store_ps(&dst[i + 16], result2); - _mm256_store_ps(&dst[i + 24], result3); - } - - // 处理剩余的向量(单次处理) - for (; i + simd_width <= num_samples; i += simd_width) { - auto a = _mm256_load_ps(&src[i]); - auto result = _mm256_mul_ps(a, gain_vec); - _mm256_store_ps(&dst[i], result); - } - - // 处理剩余的标量样本 - for (; i < num_samples; ++i) { - dst[i] = src[i] * gain; - } - } - - void apply_gain_avx512(const float* src, float* dst, float gain, size_t num_samples) { - ASSERT_ALIGNED(src, ALIGNMENT_AVX512); - ASSERT_ALIGNED(dst, ALIGNMENT_AVX512); - - constexpr size_t simd_width = 16; // AVX-512每次处理16个float - constexpr size_t unroll_factor = 4; // 4路循环展开提高指令级并行性 - size_t i = 0; - - auto gain_vec = _mm512_set1_ps(gain); - - // 向量化处理(4路循环展开) - for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) { - // 加载4个向量 - auto a0 = _mm512_load_ps(&src[i]); - auto a1 = _mm512_load_ps(&src[i + 16]); - auto a2 = _mm512_load_ps(&src[i + 32]); - auto a3 = _mm512_load_ps(&src[i + 48]); - - // 并行计算增益应用 - auto result0 = _mm512_mul_ps(a0, gain_vec); - auto result1 = _mm512_mul_ps(a1, gain_vec); - auto result2 = _mm512_mul_ps(a2, gain_vec); - auto result3 = _mm512_mul_ps(a3, gain_vec); - - // 存储结果 - _mm512_store_ps(&dst[i], result0); - _mm512_store_ps(&dst[i + 16], result1); - _mm512_store_ps(&dst[i + 32], result2); - _mm512_store_ps(&dst[i + 48], result3); - } - - // 处理剩余的向量(单次处理) - for (; i + simd_width <= num_samples; i += simd_width) { - auto a = _mm512_load_ps(&src[i]); - auto result = _mm512_mul_ps(a, gain_vec); - _mm512_store_ps(&dst[i], result); - } - - // 处理剩余的标量样本 - for (; i < num_samples; ++i) { - dst[i] = src[i] * gain; - } - } - - float calculate_rms_sse(const float* src, size_t num_samples) { - ASSERT_ALIGNED(src, ALIGNMENT_SSE); - - constexpr size_t simd_width = 4; // SSE每次处理4个float - constexpr size_t unroll_factor = 4; // 4路循环展开提高指令级并行性 - size_t i = 0; - auto sum_squares0 = _mm_setzero_ps(); - auto sum_squares1 = _mm_setzero_ps(); - 
auto sum_squares2 = _mm_setzero_ps(); - auto sum_squares3 = _mm_setzero_ps(); - - // 向量化处理(4路循环展开) - for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) { - // 加载4个向量 - const auto a0 = _mm_load_ps(&src[i]); - const auto a1 = _mm_load_ps(&src[i + 4]); - const auto a2 = _mm_load_ps(&src[i + 8]); - const auto a3 = _mm_load_ps(&src[i + 12]); - - // 并行计算平方 - const auto squared0 = _mm_mul_ps(a0, a0); - const auto squared1 = _mm_mul_ps(a1, a1); - const auto squared2 = _mm_mul_ps(a2, a2); - const auto squared3 = _mm_mul_ps(a3, a3); - - // 累加到各自的累加器 - sum_squares0 = _mm_add_ps(sum_squares0, squared0); - sum_squares1 = _mm_add_ps(sum_squares1, squared1); - sum_squares2 = _mm_add_ps(sum_squares2, squared2); - sum_squares3 = _mm_add_ps(sum_squares3, squared3); - } - - // 合并4个累加器 - auto sum_squares = _mm_add_ps(_mm_add_ps(sum_squares0, sum_squares1), - _mm_add_ps(sum_squares2, sum_squares3)); - - // 处理剩余的向量(单次处理) - for (; i + simd_width <= num_samples; i += simd_width) { - const auto a = _mm_load_ps(&src[i]); - const auto squared = _mm_mul_ps(a, a); - sum_squares = _mm_add_ps(sum_squares, squared); - } - - // **关键优化:高效的SSE水平归约操作** - // 使用hadd指令进行水平加法,避免内存存储+循环 - auto hadd1 = _mm_hadd_ps(sum_squares, sum_squares); // [a+b, c+d, a+b, c+d] - auto hadd2 = _mm_hadd_ps(hadd1, hadd1); // [a+b+c+d, *, a+b+c+d, *] - double total_sum = _mm_cvtss_f32(hadd2); - - // 处理剩余的标量样本 - for (; i < num_samples; ++i) { - total_sum += static_cast(src[i]) * static_cast(src[i]); - } - - return static_cast(std::sqrt(total_sum / static_cast(num_samples))); - } - - float calculate_rms_avx(const float* src, size_t num_samples) { - ASSERT_ALIGNED(src, ALIGNMENT_AVX); - - constexpr size_t simd_width = 8; // AVX每次处理8个float - constexpr size_t unroll_factor = 4; // 4路循环展开提高指令级并行性 - size_t i = 0; - auto sum_squares0 = _mm256_setzero_ps(); - auto sum_squares1 = _mm256_setzero_ps(); - auto sum_squares2 = _mm256_setzero_ps(); - auto sum_squares3 = _mm256_setzero_ps(); - - // 
向量化处理(4路循环展开) - for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) { - // 加载4个向量 - const auto a0 = _mm256_load_ps(&src[i]); - const auto a1 = _mm256_load_ps(&src[i + 8]); - const auto a2 = _mm256_load_ps(&src[i + 16]); - const auto a3 = _mm256_load_ps(&src[i + 24]); - - // 并行计算平方 - const auto squared0 = _mm256_mul_ps(a0, a0); - const auto squared1 = _mm256_mul_ps(a1, a1); - const auto squared2 = _mm256_mul_ps(a2, a2); - const auto squared3 = _mm256_mul_ps(a3, a3); - - // 累加到各自的累加器 - sum_squares0 = _mm256_add_ps(sum_squares0, squared0); - sum_squares1 = _mm256_add_ps(sum_squares1, squared1); - sum_squares2 = _mm256_add_ps(sum_squares2, squared2); - sum_squares3 = _mm256_add_ps(sum_squares3, squared3); - } - - // 合并4个累加器 - auto sum_squares = _mm256_add_ps(_mm256_add_ps(sum_squares0, sum_squares1), - _mm256_add_ps(sum_squares2, sum_squares3)); - - // 处理剩余的向量(单次处理) - for (; i + simd_width <= num_samples; i += simd_width) { - const auto a = _mm256_load_ps(&src[i]); - const auto squared = _mm256_mul_ps(a, a); - sum_squares = _mm256_add_ps(sum_squares, squared); - } - - // **关键优化:高效的AVX水平归约操作** - // 使用hadd + extract指令避免内存存储+循环 - auto hadd1 = _mm256_hadd_ps(sum_squares, sum_squares); - auto hadd2 = _mm256_hadd_ps(hadd1, hadd1); - - // 提取高低128位并相加 - auto low = _mm256_extractf128_ps(hadd2, 0); - auto high = _mm256_extractf128_ps(hadd2, 1); - auto final_sum = _mm_add_ps(low, high); - double total_sum = _mm_cvtss_f32(final_sum); - - // 处理剩余的标量样本 - for (; i < num_samples; ++i) { - total_sum += static_cast(src[i]) * static_cast(src[i]); - } - - return static_cast(std::sqrt(total_sum / static_cast(num_samples))); - } - - float calculate_rms_avx512(const float* src, size_t num_samples) { - ASSERT_ALIGNED(src, ALIGNMENT_AVX512); - - constexpr size_t simd_width = 16; // AVX-512每次处理16个float - constexpr size_t unroll_factor = 4; // 4路循环展开提高指令级并行性 - size_t i = 0; - auto sum_squares0 = _mm512_setzero_ps(); - auto sum_squares1 = _mm512_setzero_ps(); - 
auto sum_squares2 = _mm512_setzero_ps(); - auto sum_squares3 = _mm512_setzero_ps(); - - // 向量化处理(4路循环展开) - for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) { - // 加载4个向量 - const auto a0 = _mm512_load_ps(&src[i]); - const auto a1 = _mm512_load_ps(&src[i + 16]); - const auto a2 = _mm512_load_ps(&src[i + 32]); - const auto a3 = _mm512_load_ps(&src[i + 48]); - - // 并行计算平方 - const auto squared0 = _mm512_mul_ps(a0, a0); - const auto squared1 = _mm512_mul_ps(a1, a1); - const auto squared2 = _mm512_mul_ps(a2, a2); - const auto squared3 = _mm512_mul_ps(a3, a3); - - // 累加到各自的累加器 - sum_squares0 = _mm512_add_ps(sum_squares0, squared0); - sum_squares1 = _mm512_add_ps(sum_squares1, squared1); - sum_squares2 = _mm512_add_ps(sum_squares2, squared2); - sum_squares3 = _mm512_add_ps(sum_squares3, squared3); - } - - // 合并4个累加器 - auto sum_squares = _mm512_add_ps(_mm512_add_ps(sum_squares0, sum_squares1), - _mm512_add_ps(sum_squares2, sum_squares3)); - - // 处理剩余的向量(单次处理) - for (; i + simd_width <= num_samples; i += simd_width) { - const auto a = _mm512_load_ps(&src[i]); - const auto squared = _mm512_mul_ps(a, a); - sum_squares = _mm512_add_ps(sum_squares, squared); - } - - // **AVX-512水平归约优化**: - // 使用专用reduce指令,单条指令完成所有16个元素的求和 - // 这是AVX-512相比AVX/SSE的重大优势 - double total_sum = _mm512_reduce_add_ps(sum_squares); - - // 处理剩余的标量样本 - for (; i < num_samples; ++i) { - total_sum += static_cast(src[i]) * static_cast(src[i]); - } - - return static_cast(std::sqrt(total_sum / static_cast(num_samples))); - } - - float calculate_peak_sse(const float* src, size_t num_samples) { - ASSERT_ALIGNED(src, ALIGNMENT_SSE); - - constexpr size_t simd_width = 4; // SSE每次处理4个float - constexpr size_t unroll_factor = 4; // 4路循环展开提高指令级并行性 - size_t i = 0; - auto peak_vec0 = _mm_setzero_ps(); - auto peak_vec1 = _mm_setzero_ps(); - auto peak_vec2 = _mm_setzero_ps(); - auto peak_vec3 = _mm_setzero_ps(); - - // 向量化处理(4路循环展开) - for (; i + simd_width * unroll_factor <= num_samples; i 
+= simd_width * unroll_factor) { - // 加载4个向量 - const auto a0 = _mm_load_ps(&src[i]); - const auto a1 = _mm_load_ps(&src[i + 4]); - const auto a2 = _mm_load_ps(&src[i + 8]); - const auto a3 = _mm_load_ps(&src[i + 12]); - - // 并行计算绝对值 - const auto abs_a0 = _mm_andnot_ps(_mm_set1_ps(-0.0f), a0); - const auto abs_a1 = _mm_andnot_ps(_mm_set1_ps(-0.0f), a1); - const auto abs_a2 = _mm_andnot_ps(_mm_set1_ps(-0.0f), a2); - const auto abs_a3 = _mm_andnot_ps(_mm_set1_ps(-0.0f), a3); - - // 更新各自的峰值向量 - peak_vec0 = _mm_max_ps(peak_vec0, abs_a0); - peak_vec1 = _mm_max_ps(peak_vec1, abs_a1); - peak_vec2 = _mm_max_ps(peak_vec2, abs_a2); - peak_vec3 = _mm_max_ps(peak_vec3, abs_a3); - } - - // 合并4个峰值向量 - auto peak_vec = _mm_max_ps(_mm_max_ps(peak_vec0, peak_vec1), - _mm_max_ps(peak_vec2, peak_vec3)); - - // 处理剩余的向量(单次处理) - for (; i + simd_width <= num_samples; i += simd_width) { - const auto a = _mm_load_ps(&src[i]); - const auto abs_a = _mm_andnot_ps(_mm_set1_ps(-0.0f), a); - peak_vec = _mm_max_ps(peak_vec, abs_a); - } - - // **关键优化:高效的SSE水平最大值归约操作** - // 使用shuffle指令序列避免内存存储+循环 - auto temp1 = _mm_shuffle_ps(peak_vec, peak_vec, _MM_SHUFFLE(2, 3, 0, 1)); // [y, x, w, z] - auto max1 = _mm_max_ps(peak_vec, temp1); // [max(x,y), max(x,y), max(z,w), max(z,w)] - auto temp2 = _mm_shuffle_ps(max1, max1, _MM_SHUFFLE(1, 0, 3, 2)); // [max(z,w), max(z,w), max(x,y), max(x,y)] - auto final_max = _mm_max_ps(max1, temp2); // [final_max, *, *, *] - float peak = _mm_cvtss_f32(final_max); - - // 处理剩余的标量样本 - for (; i < num_samples; ++i) { - float abs_sample = std::fabs(src[i]); - if (abs_sample > peak) { - peak = abs_sample; - } - } - - return peak; - } - - float calculate_peak_avx(const float* src, size_t num_samples) { - ASSERT_ALIGNED(src, ALIGNMENT_AVX); - - constexpr size_t simd_width = 8; // AVX每次处理8个float - constexpr size_t unroll_factor = 4; // 4路循环展开提高指令级并行性 - size_t i = 0; - auto peak_vec0 = _mm256_setzero_ps(); - auto peak_vec1 = _mm256_setzero_ps(); - auto peak_vec2 = _mm256_setzero_ps(); 
- auto peak_vec3 = _mm256_setzero_ps(); - - // 向量化处理(4路循环展开) - for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) { - // 加载4个向量 - const auto a0 = _mm256_load_ps(&src[i]); - const auto a1 = _mm256_load_ps(&src[i + 8]); - const auto a2 = _mm256_load_ps(&src[i + 16]); - const auto a3 = _mm256_load_ps(&src[i + 24]); - - // 并行计算绝对值 - const auto abs_a0 = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a0); - const auto abs_a1 = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a1); - const auto abs_a2 = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a2); - const auto abs_a3 = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a3); - - // 更新各自的峰值向量 - peak_vec0 = _mm256_max_ps(peak_vec0, abs_a0); - peak_vec1 = _mm256_max_ps(peak_vec1, abs_a1); - peak_vec2 = _mm256_max_ps(peak_vec2, abs_a2); - peak_vec3 = _mm256_max_ps(peak_vec3, abs_a3); - } - - // 合并4个峰值向量 - auto peak_vec = _mm256_max_ps(_mm256_max_ps(peak_vec0, peak_vec1), - _mm256_max_ps(peak_vec2, peak_vec3)); - - // 处理剩余的向量(单次处理) - for (; i + simd_width <= num_samples; i += simd_width) { - const auto a = _mm256_load_ps(&src[i]); - const auto abs_a = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a); - peak_vec = _mm256_max_ps(peak_vec, abs_a); - } - - // **关键优化:高效的AVX水平最大值归约操作** - // 提取高低128位并求最大值,然后使用SSE水平最大值 - auto low = _mm256_extractf128_ps(peak_vec, 0); - auto high = _mm256_extractf128_ps(peak_vec, 1); - auto max_lane = _mm_max_ps(low, high); - - // 在128位向量内进行水平最大值操作 - auto temp1 = _mm_shuffle_ps(max_lane, max_lane, _MM_SHUFFLE(2, 3, 0, 1)); - auto max1 = _mm_max_ps(max_lane, temp1); - auto temp2 = _mm_shuffle_ps(max1, max1, _MM_SHUFFLE(1, 0, 3, 2)); - auto final_max = _mm_max_ps(max1, temp2); - float peak = _mm_cvtss_f32(final_max); - - // 处理剩余的标量样本 - for (; i < num_samples; ++i) { - float abs_sample = std::fabs(src[i]); - if (abs_sample > peak) { - peak = abs_sample; - } - } - - return peak; - } - - float calculate_peak_avx512(const float* src, size_t num_samples) { - ASSERT_ALIGNED(src, ALIGNMENT_AVX512); - - 
constexpr size_t simd_width = 16; // AVX-512每次处理16个float - constexpr size_t unroll_factor = 4; // 4路循环展开提高指令级并行性 - size_t i = 0; - auto peak_vec0 = _mm512_setzero_ps(); - auto peak_vec1 = _mm512_setzero_ps(); - auto peak_vec2 = _mm512_setzero_ps(); - auto peak_vec3 = _mm512_setzero_ps(); - - // 向量化处理(4路循环展开) - for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) { - // 加载4个向量 - const auto a0 = _mm512_load_ps(&src[i]); - const auto a1 = _mm512_load_ps(&src[i + 16]); - const auto a2 = _mm512_load_ps(&src[i + 32]); - const auto a3 = _mm512_load_ps(&src[i + 48]); - - // 并行计算绝对值 - const auto abs_a0 = _mm512_andnot_ps(_mm512_set1_ps(-0.0f), a0); - const auto abs_a1 = _mm512_andnot_ps(_mm512_set1_ps(-0.0f), a1); - const auto abs_a2 = _mm512_andnot_ps(_mm512_set1_ps(-0.0f), a2); - const auto abs_a3 = _mm512_andnot_ps(_mm512_set1_ps(-0.0f), a3); - - // 更新各自的峰值向量 - peak_vec0 = _mm512_max_ps(peak_vec0, abs_a0); - peak_vec1 = _mm512_max_ps(peak_vec1, abs_a1); - peak_vec2 = _mm512_max_ps(peak_vec2, abs_a2); - peak_vec3 = _mm512_max_ps(peak_vec3, abs_a3); - } - - // 合并4个峰值向量 - auto peak_vec = _mm512_max_ps(_mm512_max_ps(peak_vec0, peak_vec1), - _mm512_max_ps(peak_vec2, peak_vec3)); - - // 处理剩余的向量(单次处理) - for (; i + simd_width <= num_samples; i += simd_width) { - const auto a = _mm512_load_ps(&src[i]); - const auto abs_a = _mm512_andnot_ps(_mm512_set1_ps(-0.0f), a); - peak_vec = _mm512_max_ps(peak_vec, abs_a); - } - - // **关键优化:高效的AVX-512水平最大值归约操作** - // 使用专用的reduce指令,性能最优 - float peak = _mm512_reduce_max_ps(peak_vec); - - // 处理剩余的标量样本 - for (; i < num_samples; ++i) { - float abs_sample = std::fabs(src[i]); - if (abs_sample > peak) { - peak = abs_sample; - } - } - - return peak; - } - - // 音频归一化函数实现 (SSE版本) - void normalize_audio_sse(const float* src, float* dst, float target_peak, size_t num_samples) { - ASSERT_ALIGNED(src, ALIGNMENT_SSE); - ASSERT_ALIGNED(dst, ALIGNMENT_SSE); - - // 边界情况处理 - if (num_samples == 0 || target_peak <= 0.0f) { - 
return; - } - - // 计算当前音频的峰值 - const float current_peak = calculate_peak_sse(src, num_samples); - - // 如果当前峰值过小,设置为静音或均匀值 - if (current_peak < 1e-10f) { - // 使用SSE优化的零填充 - constexpr size_t simd_width = 4; // SSE每次处理4个float - constexpr size_t unroll_factor = 4; - auto zero_vec = _mm_setzero_ps(); - size_t i = 0; - - // 向量化处理(4路环展开) - for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) { - _mm_store_ps(&dst[i], zero_vec); - _mm_store_ps(&dst[i + 4], zero_vec); - _mm_store_ps(&dst[i + 8], zero_vec); - _mm_store_ps(&dst[i + 12], zero_vec); - } - - // 处理剩余的向量(单次处理) - for (; i + simd_width <= num_samples; i += simd_width) { - _mm_store_ps(&dst[i], zero_vec); - } - - // 处理剩余的标量样本 - for (; i < num_samples; ++i) { - dst[i] = 0.0f; - } - return; - } - - // 计算归一化增益因子 - const float gain_factor = target_peak / current_peak; - - // 使用现有的apply_gain_sse函数应用增益 - apply_gain_sse(src, dst, gain_factor, num_samples); - } - - // 音频归一化函数实现 (AVX版本) - void normalize_audio_avx(const float* src, float* dst, float target_peak, size_t num_samples) { - ASSERT_ALIGNED(src, ALIGNMENT_AVX); - ASSERT_ALIGNED(dst, ALIGNMENT_AVX); - - // 边界情况处理 - if (num_samples == 0 || target_peak <= 0.0f) { - return; - } - - // 计算当前音频的峰值 - const float current_peak = calculate_peak_avx(src, num_samples); - - // 如果当前峰值过小,设置为静音或均匀值 - if (current_peak < 1e-10f) { - // 使用AVX优化的零填充 - constexpr size_t simd_width = 8; // AVX每次处理8个float - constexpr size_t unroll_factor = 4; - auto zero_vec = _mm256_setzero_ps(); - size_t i = 0; - - // 向量化处理(4路环展开) - for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) { - _mm256_store_ps(&dst[i], zero_vec); - _mm256_store_ps(&dst[i + 8], zero_vec); - _mm256_store_ps(&dst[i + 16], zero_vec); - _mm256_store_ps(&dst[i + 24], zero_vec); - } - - // 处理剩余的向量(单次处理) - for (; i + simd_width <= num_samples; i += simd_width) { - _mm256_store_ps(&dst[i], zero_vec); - } - - // 处理剩余的标量样本 - for (; i < num_samples; ++i) { - dst[i] = 0.0f; 
- } - return; - } - - // 计算归一化增益因子 - const float gain_factor = target_peak / current_peak; - - // 使用现有的apply_gain_avx函数应用增益 - apply_gain_avx(src, dst, gain_factor, num_samples); - } - - // 音频归一化函数实现 (AVX512版本) - void normalize_audio_avx512(const float* src, float* dst, float target_peak, size_t num_samples) { - ASSERT_ALIGNED(src, ALIGNMENT_AVX512); - ASSERT_ALIGNED(dst, ALIGNMENT_AVX512); - - // 边界情况处理 - if (num_samples == 0 || target_peak <= 0.0f) { - return; - } - - // 计算当前音频的峰值 - const float current_peak = calculate_peak_avx512(src, num_samples); - - // 如果当前峰值过小,设置为静音或均匀值 - if (current_peak < 1e-10f) { - // 使用AVX512优化的零填充 - constexpr size_t simd_width = 16; // AVX512每次处理16个float - constexpr size_t unroll_factor = 4; - auto zero_vec = _mm512_setzero_ps(); - size_t i = 0; - - // 向量化处理(4路环展开) - for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) { - _mm512_store_ps(&dst[i], zero_vec); - _mm512_store_ps(&dst[i + 16], zero_vec); - _mm512_store_ps(&dst[i + 32], zero_vec); - _mm512_store_ps(&dst[i + 48], zero_vec); - } - - // 处理剩余的向量(单次处理) - for (; i + simd_width <= num_samples; i += simd_width) { - _mm512_store_ps(&dst[i], zero_vec); - } - - // 处理剩余的标量样本 - for (; i < num_samples; ++i) { - dst[i] = 0.0f; - } - return; - } - - // 计算归一化增益因子 - const float gain_factor = target_peak / current_peak; - - // 使用现有的apply_gain_avx512函数应用增益 - apply_gain_avx512(src, dst, gain_factor, num_samples); - } - - // 立体声到单声道转换函数实现 (SSE版本) - void stereo_to_mono_sse(const float* stereo_src, float* mono_dst, size_t num_stereo_samples) { - ASSERT_ALIGNED(stereo_src, ALIGNMENT_SSE); - ASSERT_ALIGNED(mono_dst, ALIGNMENT_SSE); - - // 边界情况处理 - if (num_stereo_samples == 0) { - return; - } - - constexpr size_t simd_width = 4; // SSE每次处理4个float - constexpr size_t unroll_factor = 4; // 4路循环展开提高指令级并行性 - const auto half_vec = _mm_set1_ps(0.5f); // 用于取平均值 - size_t stereo_idx = 0; // 立体声索引(以样本对为单位) - size_t mono_idx = 0; // 单声道索引 - - // 向量化处理(4路循环展开) - // 
每次处理4个单声道样本,需要读取8个立体声样本 - for (; stereo_idx + simd_width * 2 * unroll_factor <= num_stereo_samples * 2; - stereo_idx += simd_width * 2 * unroll_factor, mono_idx += simd_width * unroll_factor) { - // 加载8个立体声样本对 (16个float) - auto stereo0 = _mm_load_ps(&stereo_src[stereo_idx]); // [L0, R0, L1, R1] - auto stereo1 = _mm_load_ps(&stereo_src[stereo_idx + 4]); // [L2, R2, L3, R3] - auto stereo2 = _mm_load_ps(&stereo_src[stereo_idx + 8]); // [L4, R4, L5, R5] - auto stereo3 = _mm_load_ps(&stereo_src[stereo_idx + 12]); // [L6, R6, L7, R7] - auto stereo4 = _mm_load_ps(&stereo_src[stereo_idx + 16]); // [L8, R8, L9, R9] - auto stereo5 = _mm_load_ps(&stereo_src[stereo_idx + 20]); // [L10, R10, L11, R11] - auto stereo6 = _mm_load_ps(&stereo_src[stereo_idx + 24]); // [L12, R12, L13, R13] - auto stereo7 = _mm_load_ps(&stereo_src[stereo_idx + 28]); // [L14, R14, L15, R15] - - // 分离左右声道 - auto left0 = _mm_shuffle_ps(stereo0, stereo1, _MM_SHUFFLE(2, 0, 2, 0)); // [L0, L1, L2, L3] - auto right0 = _mm_shuffle_ps(stereo0, stereo1, _MM_SHUFFLE(3, 1, 3, 1)); // [R0, R1, R2, R3] - auto left1 = _mm_shuffle_ps(stereo2, stereo3, _MM_SHUFFLE(2, 0, 2, 0)); // [L4, L5, L6, L7] - auto right1 = _mm_shuffle_ps(stereo2, stereo3, _MM_SHUFFLE(3, 1, 3, 1)); // [R4, R5, R6, R7] - auto left2 = _mm_shuffle_ps(stereo4, stereo5, _MM_SHUFFLE(2, 0, 2, 0)); // [L8, L9, L10, L11] - auto right2 = _mm_shuffle_ps(stereo4, stereo5, _MM_SHUFFLE(3, 1, 3, 1)); // [R8, R9, R10, R11] - auto left3 = _mm_shuffle_ps(stereo6, stereo7, _MM_SHUFFLE(2, 0, 2, 0)); // [L12, L13, L14, L15] - auto right3 = _mm_shuffle_ps(stereo6, stereo7, _MM_SHUFFLE(3, 1, 3, 1)); // [R12, R13, R14, R15] - - // 计算单声道 = (左声道 + 右声道) / 2 - auto mono0 = _mm_mul_ps(_mm_add_ps(left0, right0), half_vec); - auto mono1 = _mm_mul_ps(_mm_add_ps(left1, right1), half_vec); - auto mono2 = _mm_mul_ps(_mm_add_ps(left2, right2), half_vec); - auto mono3 = _mm_mul_ps(_mm_add_ps(left3, right3), half_vec); - - // 存储结果 - _mm_store_ps(&mono_dst[mono_idx], mono0); - 
_mm_store_ps(&mono_dst[mono_idx + 4], mono1); - _mm_store_ps(&mono_dst[mono_idx + 8], mono2); - _mm_store_ps(&mono_dst[mono_idx + 12], mono3); - } - - // 处理剩余的样本对(标量处理) - for (size_t i = stereo_idx / 2; i < num_stereo_samples; ++i) { - const float left = stereo_src[i * 2]; - const float right = stereo_src[i * 2 + 1]; - mono_dst[i] = (left + right) * 0.5f; - } - } - - // 立体声到单声道转换函数实现 (AVX版本) - void stereo_to_mono_avx(const float* stereo_src, float* mono_dst, size_t num_stereo_samples) { - ASSERT_ALIGNED(stereo_src, ALIGNMENT_AVX); - ASSERT_ALIGNED(mono_dst, ALIGNMENT_AVX); - - // 边界情况处理 - if (num_stereo_samples == 0) { - return; - } - - constexpr size_t simd_width = 8; // AVX每次处理8个float - constexpr size_t unroll_factor = 4; // 4路循环展开提高指令级并行性 - const auto half_vec = _mm256_set1_ps(0.5f); // 用于取平均值 - size_t stereo_idx = 0; // 立体声索引(以样本对为单位) - size_t mono_idx = 0; // 单声道索引 - - // 向量化处理(4路循环展开) - // 每次处理8个单声道样本,需要读取16个立体声样本 - for (; stereo_idx + simd_width * 2 * unroll_factor <= num_stereo_samples * 2; - stereo_idx += simd_width * 2 * unroll_factor, mono_idx += simd_width * unroll_factor) { - // 加载16个立体声样本对 (32个float) - auto stereo0 = _mm256_load_ps(&stereo_src[stereo_idx]); // [L0,R0,L1,R1,L2,R2,L3,R3] - auto stereo1 = _mm256_load_ps(&stereo_src[stereo_idx + 8]); // [L4,R4,L5,R5,L6,R6,L7,R7] - auto stereo2 = _mm256_load_ps(&stereo_src[stereo_idx + 16]); // [L8,R8,L9,R9,L10,R10,L11,R11] - auto stereo3 = _mm256_load_ps(&stereo_src[stereo_idx + 24]); // [L12,R12,L13,R13,L14,R14,L15,R15] - auto stereo4 = _mm256_load_ps(&stereo_src[stereo_idx + 32]); // [L16,R16,L17,R17,L18,R18,L19,R19] - auto stereo5 = _mm256_load_ps(&stereo_src[stereo_idx + 40]); // [L20,R20,L21,R21,L22,R22,L23,R23] - auto stereo6 = _mm256_load_ps(&stereo_src[stereo_idx + 48]); // [L24,R24,L25,R25,L26,R26,L27,R27] - auto stereo7 = _mm256_load_ps(&stereo_src[stereo_idx + 56]); // [L28,R28,L29,R29,L30,R30,L31,R31] - - // 分离左右声道(使用shuffle和blend) - auto left0 = _mm256_shuffle_ps(stereo0, stereo1, 
_MM_SHUFFLE(2, 0, 2, 0)); // [L0,L1,L4,L5,L2,L3,L6,L7] - auto right0 = _mm256_shuffle_ps(stereo0, stereo1, _MM_SHUFFLE(3, 1, 3, 1)); // [R0,R1,R4,R5,R2,R3,R6,R7] - auto left1 = _mm256_shuffle_ps(stereo2, stereo3, _MM_SHUFFLE(2, 0, 2, 0)); - auto right1 = _mm256_shuffle_ps(stereo2, stereo3, _MM_SHUFFLE(3, 1, 3, 1)); - auto left2 = _mm256_shuffle_ps(stereo4, stereo5, _MM_SHUFFLE(2, 0, 2, 0)); - auto right2 = _mm256_shuffle_ps(stereo4, stereo5, _MM_SHUFFLE(3, 1, 3, 1)); - auto left3 = _mm256_shuffle_ps(stereo6, stereo7, _MM_SHUFFLE(2, 0, 2, 0)); - auto right3 = _mm256_shuffle_ps(stereo6, stereo7, _MM_SHUFFLE(3, 1, 3, 1)); - - // 重新排列以获得正确的顺序 - left0 = _mm256_permute2f128_ps(left0, left0, 0x01); // [L2,L3,L6,L7,L0,L1,L4,L5] - right0 = _mm256_permute2f128_ps(right0, right0, 0x01); // [R2,R3,R6,R7,R0,R1,R4,R5] - left1 = _mm256_permute2f128_ps(left1, left1, 0x01); - right1 = _mm256_permute2f128_ps(right1, right1, 0x01); - left2 = _mm256_permute2f128_ps(left2, left2, 0x01); - right2 = _mm256_permute2f128_ps(right2, right2, 0x01); - left3 = _mm256_permute2f128_ps(left3, left3, 0x01); - right3 = _mm256_permute2f128_ps(right3, right3, 0x01); - - // 计算单声道 = (左声道 + 右声道) / 2 - auto mono0 = _mm256_mul_ps(_mm256_add_ps(left0, right0), half_vec); - auto mono1 = _mm256_mul_ps(_mm256_add_ps(left1, right1), half_vec); - auto mono2 = _mm256_mul_ps(_mm256_add_ps(left2, right2), half_vec); - auto mono3 = _mm256_mul_ps(_mm256_add_ps(left3, right3), half_vec); - - // 存储结果 - _mm256_store_ps(&mono_dst[mono_idx], mono0); - _mm256_store_ps(&mono_dst[mono_idx + 8], mono1); - _mm256_store_ps(&mono_dst[mono_idx + 16], mono2); - _mm256_store_ps(&mono_dst[mono_idx + 24], mono3); - } - - // 处理剩余的样本对(标量处理) - for (size_t i = stereo_idx / 2; i < num_stereo_samples; ++i) { - const float left = stereo_src[i * 2]; - const float right = stereo_src[i * 2 + 1]; - mono_dst[i] = (left + right) * 0.5f; - } - } - - // 立体声到单声道转换函数实现 (AVX512版本) - void stereo_to_mono_avx512(const float* stereo_src, float* mono_dst, 
size_t num_stereo_samples) { - ASSERT_ALIGNED(stereo_src, ALIGNMENT_AVX512); - ASSERT_ALIGNED(mono_dst, ALIGNMENT_AVX512); - - // 边界情况处理 - if (num_stereo_samples == 0) { - return; - } - - constexpr size_t simd_width = 16; // AVX512每次处理16个float - constexpr size_t unroll_factor = 4; // 4路循环展开提高指令级并行性 - const auto half_vec = _mm512_set1_ps(0.5f); // 用于取平均值 - size_t stereo_idx = 0; // 立体声索引(以样本对为单位) - size_t mono_idx = 0; // 单声道索引 - - // 向量化处理(4路循环展开) - // 每次处理16个单声道样本,需要读取32个立体声样本 - for (; stereo_idx + simd_width * 2 * unroll_factor <= num_stereo_samples * 2; - stereo_idx += simd_width * 2 * unroll_factor, mono_idx += simd_width * unroll_factor) { - // 加载32个立体声样本对 (64个float) - auto stereo0 = _mm512_load_ps(&stereo_src[stereo_idx]); // 16个交错样本 - auto stereo1 = _mm512_load_ps(&stereo_src[stereo_idx + 16]); // 16个交错样本 - auto stereo2 = _mm512_load_ps(&stereo_src[stereo_idx + 32]); // 16个交错样本 - auto stereo3 = _mm512_load_ps(&stereo_src[stereo_idx + 48]); // 16个交错样本 - auto stereo4 = _mm512_load_ps(&stereo_src[stereo_idx + 64]); // 16个交错样本 - auto stereo5 = _mm512_load_ps(&stereo_src[stereo_idx + 80]); // 16个交错样本 - auto stereo6 = _mm512_load_ps(&stereo_src[stereo_idx + 96]); // 16个交错样本 - auto stereo7 = _mm512_load_ps(&stereo_src[stereo_idx + 112]); // 16个交错样本 - - // 使用AVX512的交替shuffle来分离左右声道 - const auto even_mask = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); - const auto odd_mask = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); - - auto left0 = _mm512_permutex2var_ps(stereo0, even_mask, stereo1); - auto right0 = _mm512_permutex2var_ps(stereo0, odd_mask, stereo1); - auto left1 = _mm512_permutex2var_ps(stereo2, even_mask, stereo3); - auto right1 = _mm512_permutex2var_ps(stereo2, odd_mask, stereo3); - auto left2 = _mm512_permutex2var_ps(stereo4, even_mask, stereo5); - auto right2 = _mm512_permutex2var_ps(stereo4, odd_mask, stereo5); - auto left3 = _mm512_permutex2var_ps(stereo6, even_mask, stereo7); - auto right3 = 
_mm512_permutex2var_ps(stereo6, odd_mask, stereo7); - - // 计算单声道 = (左声道 + 右声道) / 2 - auto mono0 = _mm512_mul_ps(_mm512_add_ps(left0, right0), half_vec); - auto mono1 = _mm512_mul_ps(_mm512_add_ps(left1, right1), half_vec); - auto mono2 = _mm512_mul_ps(_mm512_add_ps(left2, right2), half_vec); - auto mono3 = _mm512_mul_ps(_mm512_add_ps(left3, right3), half_vec); - - // 存储结果 - _mm512_store_ps(&mono_dst[mono_idx], mono0); - _mm512_store_ps(&mono_dst[mono_idx + 16], mono1); - _mm512_store_ps(&mono_dst[mono_idx + 32], mono2); - _mm512_store_ps(&mono_dst[mono_idx + 48], mono3); - } - - // 处理剩余的样本对(标量处理) - for (size_t i = stereo_idx / 2; i < num_stereo_samples; ++i) { - const float left = stereo_src[i * 2]; - const float right = stereo_src[i * 2 + 1]; - mono_dst[i] = (left + right) * 0.5f; - } - } - - // 音频限幅函数实现 (SSE版本) - void limit_audio_sse(const float* src, float* dst, float threshold, float* limiter_state, float sample_rate, - size_t num_samples) { - ASSERT_ALIGNED(src, ALIGNMENT_SSE); - ASSERT_ALIGNED(dst, ALIGNMENT_SSE); - - // 边界情况处理 - if (num_samples == 0 || threshold <= 0.0f) { - return; - } - - constexpr size_t simd_width = 4; // SSE每次处理4个float - constexpr size_t unroll_factor = 4; // 4路循环展开提高指令级并行性 - constexpr float release_time = 0.05f; // 释放时间常数(秒),可根据实际需求调整 - float release_coeff = std::exp(-1.0f / (release_time * sample_rate)); // 释放系数 - - // 初始化限幅器状态,如果是首次调用,则从1.0开始 - float current_gain = limiter_state != nullptr ? 
*limiter_state : 1.0f; - - // 阈值和释放系数向量 - const auto threshold_vec = _mm_set1_ps(threshold); - const auto release_coeff_vec = _mm_set1_ps(release_coeff); - const auto one_vec = _mm_set1_ps(1.0f); - - size_t i = 0; - - // 向量化处理(4路循环展开) - for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) { - // 加载4个向量 - auto a0 = _mm_load_ps(&src[i]); - auto a1 = _mm_load_ps(&src[i + 4]); - auto a2 = _mm_load_ps(&src[i + 8]); - auto a3 = _mm_load_ps(&src[i + 12]); - - // 计算绝对值 - auto abs_a0 = _mm_andnot_ps(_mm_set1_ps(-0.0f), a0); - auto abs_a1 = _mm_andnot_ps(_mm_set1_ps(-0.0f), a1); - auto abs_a2 = _mm_andnot_ps(_mm_set1_ps(-0.0f), a2); - auto abs_a3 = _mm_andnot_ps(_mm_set1_ps(-0.0f), a3); - - // 找出最大值 - auto max_abs = _mm_max_ps(_mm_max_ps(abs_a0, abs_a1), - _mm_max_ps(abs_a2, abs_a3)); - - // 水平最大值归约 - auto temp1 = _mm_shuffle_ps(max_abs, max_abs, _MM_SHUFFLE(2, 3, 0, 1)); // [y, x, w, z] - auto max1 = _mm_max_ps(max_abs, temp1); // [max(x,y), max(x,y), max(z,w), max(z,w)] - auto temp2 = _mm_shuffle_ps(max1, max1, _MM_SHUFFLE(1, 0, 3, 2)); - // [max(z,w), max(z,w), max(x,y), max(x,y)] - auto final_max = _mm_max_ps(max1, temp2); // [final_max, *, *, *] - - // 提取水平最大值 - float max_sample = _mm_cvtss_f32(final_max); - - // 计算需要的增益以限制幅度 - float target_gain = max_sample > threshold ? 
threshold / max_sample : 1.0f; - - // 平滑增益变化(使用包络跟随器) - if (target_gain < current_gain) { - // 立即攻击 - current_gain = target_gain; - } - else { - // 缓慢释放 - current_gain = target_gain + (current_gain - target_gain) * release_coeff; - } - - // 将当前增益转换为向量 - auto gain_vec = _mm_set1_ps(current_gain); - - // 应用增益 - auto result0 = _mm_mul_ps(a0, gain_vec); - auto result1 = _mm_mul_ps(a1, gain_vec); - auto result2 = _mm_mul_ps(a2, gain_vec); - auto result3 = _mm_mul_ps(a3, gain_vec); - - // 存储结果 - _mm_store_ps(&dst[i], result0); - _mm_store_ps(&dst[i + 4], result1); - _mm_store_ps(&dst[i + 8], result2); - _mm_store_ps(&dst[i + 12], result3); - } - - // 处理剩余的向量(单次处理) - for (; i + simd_width <= num_samples; i += simd_width) { - auto a = _mm_load_ps(&src[i]); - auto abs_a = _mm_andnot_ps(_mm_set1_ps(-0.0f), a); - - // 找出最大值 - auto max_abs = abs_a; - - // 水平最大值归约 - auto temp1 = _mm_shuffle_ps(max_abs, max_abs, _MM_SHUFFLE(2, 3, 0, 1)); - auto max1 = _mm_max_ps(max_abs, temp1); - auto temp2 = _mm_shuffle_ps(max1, max1, _MM_SHUFFLE(1, 0, 3, 2)); - auto final_max = _mm_max_ps(max1, temp2); - - // 提取水平最大值 - float max_sample = _mm_cvtss_f32(final_max); - - // 计算需要的增益以限制幅度 - float target_gain = max_sample > threshold ? threshold / max_sample : 1.0f; - - // 平滑增益变化(使用包络跟随器) - if (target_gain < current_gain) { - // 立即攻击 - current_gain = target_gain; - } - else { - // 缓慢释放 - current_gain = target_gain + (current_gain - target_gain) * release_coeff; - } - - // 应用增益 - auto gain_vec = _mm_set1_ps(current_gain); - auto result = _mm_mul_ps(a, gain_vec); - _mm_store_ps(&dst[i], result); - } - - // 处理剩余的标量样本 - for (; i < num_samples; ++i) { - float sample = src[i]; - float abs_sample = std::fabs(sample); - - // 计算需要的增益以限制幅度 - float target_gain = abs_sample > threshold ? 
threshold / abs_sample : 1.0f; - - // 平滑增益变化(使用包络跟随器) - if (target_gain < current_gain) { - // 立即攻击 - current_gain = target_gain; - } - else { - // 缓慢释放 - current_gain = target_gain + (current_gain - target_gain) * release_coeff; - } - - // 应用增益 - dst[i] = sample * current_gain; - } - - // 更新限幅器状态 - if (limiter_state != nullptr) { - *limiter_state = current_gain; - } - } - - // 音频限幅函数实现 (AVX版本) - void limit_audio_avx(const float* src, float* dst, float threshold, float* limiter_state, float sample_rate, - size_t num_samples) { - ASSERT_ALIGNED(src, ALIGNMENT_AVX); - ASSERT_ALIGNED(dst, ALIGNMENT_AVX); - - // 边界情况处理 - if (num_samples == 0 || threshold <= 0.0f) { - return; - } - - constexpr size_t simd_width = 8; // AVX每次处理8个float - constexpr size_t unroll_factor = 4; // 4路循环展开提高指令级并行性 - constexpr float release_time = 0.05f; // 释放时间常数(秒),可根据实际需求调整 - float release_coeff = std::exp(-1.0f / (release_time * sample_rate)); // 释放系数 - - // 初始化限幅器状态,如果是首次调用,则从1.0开始 - float current_gain = limiter_state != nullptr ? 
*limiter_state : 1.0f; - - // 阈值和释放系数向量 - const auto threshold_vec = _mm256_set1_ps(threshold); - const auto release_coeff_vec = _mm256_set1_ps(release_coeff); - const auto one_vec = _mm256_set1_ps(1.0f); - - size_t i = 0; - - // 向量化处理(4路循环展开) - for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) { - // 加载4个向量 - auto a0 = _mm256_load_ps(&src[i]); - auto a1 = _mm256_load_ps(&src[i + 8]); - auto a2 = _mm256_load_ps(&src[i + 16]); - auto a3 = _mm256_load_ps(&src[i + 24]); - - // 计算绝对值 - auto abs_a0 = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a0); - auto abs_a1 = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a1); - auto abs_a2 = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a2); - auto abs_a3 = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a3); - - // 找出最大值 - auto max_abs = _mm256_max_ps(_mm256_max_ps(abs_a0, abs_a1), - _mm256_max_ps(abs_a2, abs_a3)); - - // AVX水平最大值归约 - // 提取高低128位并求最大值 - auto high = _mm256_extractf128_ps(max_abs, 1); - auto low = _mm256_extractf128_ps(max_abs, 0); - auto max_lane = _mm_max_ps(high, low); - - // 在128位向量内进行水平最大值操作 - auto temp1 = _mm_shuffle_ps(max_lane, max_lane, _MM_SHUFFLE(2, 3, 0, 1)); - auto max1 = _mm_max_ps(max_lane, temp1); - auto temp2 = _mm_shuffle_ps(max1, max1, _MM_SHUFFLE(1, 0, 3, 2)); - auto final_max = _mm_max_ps(max1, temp2); - - // 提取水平最大值 - float max_sample = _mm_cvtss_f32(final_max); - - // 计算需要的增益以限制幅度 - float target_gain = max_sample > threshold ? 
threshold / max_sample : 1.0f; - - // 平滑增益变化(使用包络跟随器) - if (target_gain < current_gain) { - // 立即攻击 - current_gain = target_gain; - } - else { - // 缓慢释放 - current_gain = target_gain + (current_gain - target_gain) * release_coeff; - } - - // 将当前增益转换为向量 - auto gain_vec = _mm256_set1_ps(current_gain); - - // 应用增益 - auto result0 = _mm256_mul_ps(a0, gain_vec); - auto result1 = _mm256_mul_ps(a1, gain_vec); - auto result2 = _mm256_mul_ps(a2, gain_vec); - auto result3 = _mm256_mul_ps(a3, gain_vec); - - // 存储结果 - _mm256_store_ps(&dst[i], result0); - _mm256_store_ps(&dst[i + 8], result1); - _mm256_store_ps(&dst[i + 16], result2); - _mm256_store_ps(&dst[i + 24], result3); - } - - // 处理剩余的向量(单次处理) - for (; i + simd_width <= num_samples; i += simd_width) { - auto a = _mm256_load_ps(&src[i]); - auto abs_a = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a); - - // 提取高低128位并求最大值 - auto high = _mm256_extractf128_ps(abs_a, 1); - auto low = _mm256_extractf128_ps(abs_a, 0); - auto max_lane = _mm_max_ps(high, low); - - // 在128位向量内进行水平最大值操作 - auto temp1 = _mm_shuffle_ps(max_lane, max_lane, _MM_SHUFFLE(2, 3, 0, 1)); - auto max1 = _mm_max_ps(max_lane, temp1); - auto temp2 = _mm_shuffle_ps(max1, max1, _MM_SHUFFLE(1, 0, 3, 2)); - auto final_max = _mm_max_ps(max1, temp2); - - // 提取水平最大值 - float max_sample = _mm_cvtss_f32(final_max); - - // 计算需要的增益以限制幅度 - float target_gain = max_sample > threshold ? threshold / max_sample : 1.0f; - - // 平滑增益变化(使用包络跟随器) - if (target_gain < current_gain) { - // 立即攻击 - current_gain = target_gain; - } - else { - // 缓慢释放 - current_gain = target_gain + (current_gain - target_gain) * release_coeff; - } - - // 应用增益 - auto gain_vec = _mm256_set1_ps(current_gain); - auto result = _mm256_mul_ps(a, gain_vec); - _mm256_store_ps(&dst[i], result); - } - - // 处理剩余的标量样本 - for (; i < num_samples; ++i) { - float sample = src[i]; - float abs_sample = std::fabs(sample); - - // 计算需要的增益以限制幅度 - float target_gain = abs_sample > threshold ? 
threshold / abs_sample : 1.0f; - - // 平滑增益变化(使用包络跟随器) - if (target_gain < current_gain) { - // 立即攻击 - current_gain = target_gain; - } - else { - // 缓慢释放 - current_gain = target_gain + (current_gain - target_gain) * release_coeff; - } - - // 应用增益 - dst[i] = sample * current_gain; - } - - // 更新限幅器状态 - if (limiter_state != nullptr) { - *limiter_state = current_gain; - } - } - - // 音频限幅函数实现 (AVX512版本) - void limit_audio_avx512(const float* src, float* dst, float threshold, float* limiter_state, float sample_rate, - size_t num_samples) { - ASSERT_ALIGNED(src, ALIGNMENT_AVX512); - ASSERT_ALIGNED(dst, ALIGNMENT_AVX512); - - // 边界情况处理 - if (num_samples == 0 || threshold <= 0.0f) { - return; - } - - constexpr size_t simd_width = 16; // AVX512每次处理16个float - constexpr size_t unroll_factor = 4; // 4路循环展开提高指令级并行性 - constexpr float release_time = 0.05f; // 释放时间常数(秒),可根据实际需求调整 - float release_coeff = std::exp(-1.0f / (release_time * sample_rate)); // 释放系数 - - // 初始化限幅器状态,如果是首次调用,则从1.0开始 - float current_gain = limiter_state != nullptr ? *limiter_state : 1.0f; - - // 阈值和释放系数向量 - const auto threshold_vec = _mm512_set1_ps(threshold); - const auto release_coeff_vec = _mm512_set1_ps(release_coeff); - const auto one_vec = _mm512_set1_ps(1.0f); - - size_t i = 0; - - // 向量化处理(4路循环展开) - for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) { - // 加载4个向量 - auto a0 = _mm512_load_ps(&src[i]); - auto a1 = _mm512_load_ps(&src[i + 16]); - auto a2 = _mm512_load_ps(&src[i + 32]); - auto a3 = _mm512_load_ps(&src[i + 48]); - - // 计算绝对值 - auto abs_a0 = _mm512_abs_ps(a0); // AVX512提供直接的绝对值指令 - auto abs_a1 = _mm512_abs_ps(a1); - auto abs_a2 = _mm512_abs_ps(a2); - auto abs_a3 = _mm512_abs_ps(a3); - - // 找出最大值 - auto max_abs = _mm512_max_ps(_mm512_max_ps(abs_a0, abs_a1), - _mm512_max_ps(abs_a2, abs_a3)); - - // AVX512水平最大值归约 - float max_sample = _mm512_reduce_max_ps(max_abs); // 使用专用reduce指令 - - // 计算需要的增益以限制幅度 - float target_gain = max_sample > threshold ? 
threshold / max_sample : 1.0f; - - // 平滑增益变化(使用包络跟随器) - if (target_gain < current_gain) { - // 立即攻击 - current_gain = target_gain; - } - else { - // 缓慢释放 - current_gain = target_gain + (current_gain - target_gain) * release_coeff; - } - - // 将当前增益转换为向量 - auto gain_vec = _mm512_set1_ps(current_gain); - - // 应用增益 - auto result0 = _mm512_mul_ps(a0, gain_vec); - auto result1 = _mm512_mul_ps(a1, gain_vec); - auto result2 = _mm512_mul_ps(a2, gain_vec); - auto result3 = _mm512_mul_ps(a3, gain_vec); - - // 存储结果 - _mm512_store_ps(&dst[i], result0); - _mm512_store_ps(&dst[i + 16], result1); - _mm512_store_ps(&dst[i + 32], result2); - _mm512_store_ps(&dst[i + 48], result3); - } - - // 处理剩余的向量(单次处理) - for (; i + simd_width <= num_samples; i += simd_width) { - auto a = _mm512_load_ps(&src[i]); - auto abs_a = _mm512_abs_ps(a); - - // 水平最大值归约 - float max_sample = _mm512_reduce_max_ps(abs_a); - - // 计算需要的增益以限制幅度 - float target_gain = max_sample > threshold ? threshold / max_sample : 1.0f; - - // 平滑增益变化(使用包络跟随器) - if (target_gain < current_gain) { - // 立即攻击 - current_gain = target_gain; - } - else { - // 缓慢释放 - current_gain = target_gain + (current_gain - target_gain) * release_coeff; - } - - // 应用增益 - auto gain_vec = _mm512_set1_ps(current_gain); - auto result = _mm512_mul_ps(a, gain_vec); - _mm512_store_ps(&dst[i], result); - } - - // 处理剩余的标量样本 - for (; i < num_samples; ++i) { - float sample = src[i]; - float abs_sample = std::fabs(sample); - - // 计算需要的增益以限制幅度 - float target_gain = abs_sample > threshold ? 
threshold / abs_sample : 1.0f; - - // 平滑增益变化(使用包络跟随器) - if (target_gain < current_gain) { - // 立即攻击 - current_gain = target_gain; - } - else { - // 缓慢释放 - current_gain = target_gain + (current_gain - target_gain) * release_coeff; - } - - // 应用增益 - dst[i] = sample * current_gain; - } - - // 更新限幅器状态 - if (limiter_state != nullptr) { - *limiter_state = current_gain; - } - } - - // 音频淡入淡出函数实现 (SSE版本) - void fade_audio_sse(const float* src, float* dst, size_t fade_in_samples, size_t fade_out_samples, - size_t num_samples) { - ASSERT_ALIGNED(src, ALIGNMENT_SSE); - ASSERT_ALIGNED(dst, ALIGNMENT_SSE); - - // 边界情况处理 - if (num_samples == 0) { - return; - } - - constexpr size_t simd_width = 4; // SSE每次处理4个float - constexpr size_t unroll_factor = 4; // 4路循环展开提高指令级并行性 - size_t i = 0; - - // 处理淡入部分 - if (fade_in_samples > 0) { - const float fade_in_step = 1.0f / static_cast(fade_in_samples); - const auto fade_in_step_vec = _mm_set1_ps(fade_in_step); - - // 向量化处理淡入(4路循环展开) - for (; i + simd_width * unroll_factor <= std::min(fade_in_samples, num_samples); i += simd_width * - unroll_factor) { - // 计算当前样本的淡入系数 - auto gain0 = _mm_set_ps((i + 3) * fade_in_step, (i + 2) * fade_in_step, (i + 1) * fade_in_step, - i * fade_in_step); - auto gain1 = _mm_set_ps((i + 7) * fade_in_step, (i + 6) * fade_in_step, (i + 5) * fade_in_step, - (i + 4) * fade_in_step); - auto gain2 = _mm_set_ps((i + 11) * fade_in_step, (i + 10) * fade_in_step, (i + 9) * fade_in_step, - (i + 8) * fade_in_step); - auto gain3 = _mm_set_ps((i + 15) * fade_in_step, (i + 14) * fade_in_step, (i + 13) * fade_in_step, - (i + 12) * fade_in_step); - - // 加载音频样本 - auto a0 = _mm_load_ps(&src[i]); - auto a1 = _mm_load_ps(&src[i + 4]); - auto a2 = _mm_load_ps(&src[i + 8]); - auto a3 = _mm_load_ps(&src[i + 12]); - - // 应用淡入增益 - auto result0 = _mm_mul_ps(a0, gain0); - auto result1 = _mm_mul_ps(a1, gain1); - auto result2 = _mm_mul_ps(a2, gain2); - auto result3 = _mm_mul_ps(a3, gain3); - - // 存储结果 - _mm_store_ps(&dst[i], result0); - 
_mm_store_ps(&dst[i + 4], result1); - _mm_store_ps(&dst[i + 8], result2); - _mm_store_ps(&dst[i + 12], result3); - } - - // 处理剩余的淡入样本(标量处理) - for (; i < std::min(fade_in_samples, num_samples); ++i) { - const float gain = static_cast(i) / static_cast(fade_in_samples); - dst[i] = src[i] * gain; - } - } - - // 处理中间部分(无淡入淡出,直接复制) - const size_t middle_start = fade_in_samples; - const size_t middle_end = num_samples > fade_out_samples ? num_samples - fade_out_samples : 0; - - if (middle_end > middle_start) { - // 使用SSE优化的直接复制 - for (size_t j = middle_start; j + simd_width * unroll_factor <= middle_end; j += simd_width * - unroll_factor) { - auto a0 = _mm_load_ps(&src[j]); - auto a1 = _mm_load_ps(&src[j + 4]); - auto a2 = _mm_load_ps(&src[j + 8]); - auto a3 = _mm_load_ps(&src[j + 12]); - - _mm_store_ps(&dst[j], a0); - _mm_store_ps(&dst[j + 4], a1); - _mm_store_ps(&dst[j + 8], a2); - _mm_store_ps(&dst[j + 12], a3); - } - - // 处理剩余的中间样本(标量处理) - for (size_t j = middle_start + ((middle_end - middle_start) / (simd_width * unroll_factor)) * (simd_width * - unroll_factor); j < middle_end; ++j) { - dst[j] = src[j]; - } - } - - // 处理淡出部分 - if (fade_out_samples > 0 && num_samples > fade_out_samples) { - const size_t fade_out_start = num_samples - fade_out_samples; - const float fade_out_step = 1.0f / static_cast(fade_out_samples); - - // 向量化处理淡出(4路循环展开) - for (size_t j = fade_out_start; j + simd_width * unroll_factor <= num_samples; j += simd_width * - unroll_factor) { - // 计算当前样本的淡出系数(从1递减到0) - const size_t fade_out_offset = j - fade_out_start; - auto gain0 = _mm_set_ps(1.0f - (fade_out_offset + 3) * fade_out_step, - 1.0f - (fade_out_offset + 2) * fade_out_step, - 1.0f - (fade_out_offset + 1) * fade_out_step, - 1.0f - fade_out_offset * fade_out_step); - auto gain1 = _mm_set_ps(1.0f - (fade_out_offset + 7) * fade_out_step, - 1.0f - (fade_out_offset + 6) * fade_out_step, - 1.0f - (fade_out_offset + 5) * fade_out_step, - 1.0f - (fade_out_offset + 4) * fade_out_step); - auto gain2 = 
_mm_set_ps(1.0f - (fade_out_offset + 11) * fade_out_step, - 1.0f - (fade_out_offset + 10) * fade_out_step, - 1.0f - (fade_out_offset + 9) * fade_out_step, - 1.0f - (fade_out_offset + 8) * fade_out_step); - auto gain3 = _mm_set_ps(1.0f - (fade_out_offset + 15) * fade_out_step, - 1.0f - (fade_out_offset + 14) * fade_out_step, - 1.0f - (fade_out_offset + 13) * fade_out_step, - 1.0f - (fade_out_offset + 12) * fade_out_step); - - // 加载音频样本 - auto a0 = _mm_load_ps(&src[j]); - auto a1 = _mm_load_ps(&src[j + 4]); - auto a2 = _mm_load_ps(&src[j + 8]); - auto a3 = _mm_load_ps(&src[j + 12]); - - // 应用淡出增益 - auto result0 = _mm_mul_ps(a0, gain0); - auto result1 = _mm_mul_ps(a1, gain1); - auto result2 = _mm_mul_ps(a2, gain2); - auto result3 = _mm_mul_ps(a3, gain3); - - // 存储结果 - _mm_store_ps(&dst[j], result0); - _mm_store_ps(&dst[j + 4], result1); - _mm_store_ps(&dst[j + 8], result2); - _mm_store_ps(&dst[j + 12], result3); - } - - // 处理剩余的淡出样本(标量处理) - for (size_t j = fade_out_start + ((fade_out_samples / (simd_width * unroll_factor)) * (simd_width * - unroll_factor)); j < num_samples; ++j) { - const size_t fade_out_offset = j - fade_out_start; - const float gain = 1.0f - static_cast(fade_out_offset) / static_cast(fade_out_samples); - dst[j] = src[j] * gain; - } - } - } - - // 音频淡入淡出函数实现 (AVX版本) - void fade_audio_avx(const float* src, float* dst, size_t fade_in_samples, size_t fade_out_samples, - size_t num_samples) { - ASSERT_ALIGNED(src, ALIGNMENT_AVX); - ASSERT_ALIGNED(dst, ALIGNMENT_AVX); - - // 边界情况处理 - if (num_samples == 0) { - return; - } - - constexpr size_t simd_width = 8; // AVX每次处理8个float - constexpr size_t unroll_factor = 4; // 4路循环展开提高指令级并行性 - size_t i = 0; - - // 处理淡入部分 - if (fade_in_samples > 0) { - const float fade_in_step = 1.0f / static_cast(fade_in_samples); - - // 向量化处理淡入(4路循环展开) - for (; i + simd_width * unroll_factor <= std::min(fade_in_samples, num_samples); i += simd_width * - unroll_factor) { - // 计算当前样本的淡入系数 - auto gain0 = _mm256_set_ps((i + 7) * 
fade_in_step, (i + 6) * fade_in_step, (i + 5) * fade_in_step, - (i + 4) * fade_in_step, - (i + 3) * fade_in_step, (i + 2) * fade_in_step, (i + 1) * fade_in_step, - i * fade_in_step); - auto gain1 = _mm256_set_ps((i + 15) * fade_in_step, (i + 14) * fade_in_step, (i + 13) * fade_in_step, - (i + 12) * fade_in_step, - (i + 11) * fade_in_step, (i + 10) * fade_in_step, (i + 9) * fade_in_step, - (i + 8) * fade_in_step); - auto gain2 = _mm256_set_ps((i + 23) * fade_in_step, (i + 22) * fade_in_step, (i + 21) * fade_in_step, - (i + 20) * fade_in_step, - (i + 19) * fade_in_step, (i + 18) * fade_in_step, (i + 17) * fade_in_step, - (i + 16) * fade_in_step); - auto gain3 = _mm256_set_ps((i + 31) * fade_in_step, (i + 30) * fade_in_step, (i + 29) * fade_in_step, - (i + 28) * fade_in_step, - (i + 27) * fade_in_step, (i + 26) * fade_in_step, (i + 25) * fade_in_step, - (i + 24) * fade_in_step); - - // 加载音频样本 - auto a0 = _mm256_load_ps(&src[i]); - auto a1 = _mm256_load_ps(&src[i + 8]); - auto a2 = _mm256_load_ps(&src[i + 16]); - auto a3 = _mm256_load_ps(&src[i + 24]); - - // 应用淡入增益 - auto result0 = _mm256_mul_ps(a0, gain0); - auto result1 = _mm256_mul_ps(a1, gain1); - auto result2 = _mm256_mul_ps(a2, gain2); - auto result3 = _mm256_mul_ps(a3, gain3); - - // 存储结果 - _mm256_store_ps(&dst[i], result0); - _mm256_store_ps(&dst[i + 8], result1); - _mm256_store_ps(&dst[i + 16], result2); - _mm256_store_ps(&dst[i + 24], result3); - } - - // 处理剩余的淡入样本(标量处理) - for (; i < std::min(fade_in_samples, num_samples); ++i) { - const float gain = static_cast(i) / static_cast(fade_in_samples); - dst[i] = src[i] * gain; - } - } - - // 处理中间部分(无淡入淡出,直接复制) - const size_t middle_start = fade_in_samples; - const size_t middle_end = num_samples > fade_out_samples ? 
num_samples - fade_out_samples : 0; - - if (middle_end > middle_start) { - // 使用AVX优化的直接复制 - for (size_t j = middle_start; j + simd_width * unroll_factor <= middle_end; j += simd_width * - unroll_factor) { - auto a0 = _mm256_load_ps(&src[j]); - auto a1 = _mm256_load_ps(&src[j + 8]); - auto a2 = _mm256_load_ps(&src[j + 16]); - auto a3 = _mm256_load_ps(&src[j + 24]); - - _mm256_store_ps(&dst[j], a0); - _mm256_store_ps(&dst[j + 8], a1); - _mm256_store_ps(&dst[j + 16], a2); - _mm256_store_ps(&dst[j + 24], a3); - } - - // 处理剩余的中间样本(标量处理) - for (size_t j = middle_start + ((middle_end - middle_start) / (simd_width * unroll_factor)) * (simd_width * - unroll_factor); j < middle_end; ++j) { - dst[j] = src[j]; - } - } - - // 处理淡出部分 - if (fade_out_samples > 0 && num_samples > fade_out_samples) { - const size_t fade_out_start = num_samples - fade_out_samples; - const float fade_out_step = 1.0f / static_cast(fade_out_samples); - - // 向量化处理淡出(4路循环展开) - for (size_t j = fade_out_start; j + simd_width * unroll_factor <= num_samples; j += simd_width * - unroll_factor) { - // 计算当前样本的淡出系数(从1递减到0) - const size_t fade_out_offset = j - fade_out_start; - auto gain0 = _mm256_set_ps(1.0f - (fade_out_offset + 7) * fade_out_step, - 1.0f - (fade_out_offset + 6) * fade_out_step, - 1.0f - (fade_out_offset + 5) * fade_out_step, - 1.0f - (fade_out_offset + 4) * fade_out_step, - 1.0f - (fade_out_offset + 3) * fade_out_step, - 1.0f - (fade_out_offset + 2) * fade_out_step, - 1.0f - (fade_out_offset + 1) * fade_out_step, - 1.0f - fade_out_offset * fade_out_step); - auto gain1 = _mm256_set_ps(1.0f - (fade_out_offset + 15) * fade_out_step, - 1.0f - (fade_out_offset + 14) * fade_out_step, - 1.0f - (fade_out_offset + 13) * fade_out_step, - 1.0f - (fade_out_offset + 12) * fade_out_step, - 1.0f - (fade_out_offset + 11) * fade_out_step, - 1.0f - (fade_out_offset + 10) * fade_out_step, - 1.0f - (fade_out_offset + 9) * fade_out_step, - 1.0f - (fade_out_offset + 8) * fade_out_step); - auto gain2 = 
_mm256_set_ps(1.0f - (fade_out_offset + 23) * fade_out_step, - 1.0f - (fade_out_offset + 22) * fade_out_step, - 1.0f - (fade_out_offset + 21) * fade_out_step, - 1.0f - (fade_out_offset + 20) * fade_out_step, - 1.0f - (fade_out_offset + 19) * fade_out_step, - 1.0f - (fade_out_offset + 18) * fade_out_step, - 1.0f - (fade_out_offset + 17) * fade_out_step, - 1.0f - (fade_out_offset + 16) * fade_out_step); - auto gain3 = _mm256_set_ps(1.0f - (fade_out_offset + 31) * fade_out_step, - 1.0f - (fade_out_offset + 30) * fade_out_step, - 1.0f - (fade_out_offset + 29) * fade_out_step, - 1.0f - (fade_out_offset + 28) * fade_out_step, - 1.0f - (fade_out_offset + 27) * fade_out_step, - 1.0f - (fade_out_offset + 26) * fade_out_step, - 1.0f - (fade_out_offset + 25) * fade_out_step, - 1.0f - (fade_out_offset + 24) * fade_out_step); - - // 加载音频样本 - auto a0 = _mm256_load_ps(&src[j]); - auto a1 = _mm256_load_ps(&src[j + 8]); - auto a2 = _mm256_load_ps(&src[j + 16]); - auto a3 = _mm256_load_ps(&src[j + 24]); - - // 应用淡出增益 - auto result0 = _mm256_mul_ps(a0, gain0); - auto result1 = _mm256_mul_ps(a1, gain1); - auto result2 = _mm256_mul_ps(a2, gain2); - auto result3 = _mm256_mul_ps(a3, gain3); - - // 存储结果 - _mm256_store_ps(&dst[j], result0); - _mm256_store_ps(&dst[j + 8], result1); - _mm256_store_ps(&dst[j + 16], result2); - _mm256_store_ps(&dst[j + 24], result3); - } - - // 处理剩余的淡出样本(标量处理) - for (size_t j = fade_out_start + ((fade_out_samples / (simd_width * unroll_factor)) * (simd_width * - unroll_factor)); j < num_samples; ++j) { - const size_t fade_out_offset = j - fade_out_start; - const float gain = 1.0f - static_cast(fade_out_offset) / static_cast(fade_out_samples); - dst[j] = src[j] * gain; - } - } - } - - // 音频淡入淡出函数实现 (AVX512版本) - void fade_audio_avx512(const float* src, float* dst, size_t fade_in_samples, size_t fade_out_samples, - size_t num_samples) { - ASSERT_ALIGNED(src, ALIGNMENT_AVX512); - ASSERT_ALIGNED(dst, ALIGNMENT_AVX512); - - // 边界情况处理 - if (num_samples == 0) { - 
return; - } - - constexpr size_t simd_width = 16; // AVX512每次处理16个float - constexpr size_t unroll_factor = 4; // 4路循环展开提高指令级并行性 - size_t i = 0; - - // 处理淡入部分 - if (fade_in_samples > 0) { - const float fade_in_step = 1.0f / static_cast(fade_in_samples); - - // 向量化处理淡入(4路循环展开) - for (; i + simd_width * unroll_factor <= std::min(fade_in_samples, num_samples); i += simd_width * - unroll_factor) { - // 计算当前样本的淡入系数 - auto gain0 = _mm512_set_ps((i + 15) * fade_in_step, (i + 14) * fade_in_step, (i + 13) * fade_in_step, - (i + 12) * fade_in_step, - (i + 11) * fade_in_step, (i + 10) * fade_in_step, (i + 9) * fade_in_step, - (i + 8) * fade_in_step, - (i + 7) * fade_in_step, (i + 6) * fade_in_step, (i + 5) * fade_in_step, - (i + 4) * fade_in_step, - (i + 3) * fade_in_step, (i + 2) * fade_in_step, (i + 1) * fade_in_step, - i * fade_in_step); - auto gain1 = _mm512_set_ps((i + 31) * fade_in_step, (i + 30) * fade_in_step, (i + 29) * fade_in_step, - (i + 28) * fade_in_step, - (i + 27) * fade_in_step, (i + 26) * fade_in_step, (i + 25) * fade_in_step, - (i + 24) * fade_in_step, - (i + 23) * fade_in_step, (i + 22) * fade_in_step, (i + 21) * fade_in_step, - (i + 20) * fade_in_step, - (i + 19) * fade_in_step, (i + 18) * fade_in_step, (i + 17) * fade_in_step, - (i + 16) * fade_in_step); - auto gain2 = _mm512_set_ps((i + 47) * fade_in_step, (i + 46) * fade_in_step, (i + 45) * fade_in_step, - (i + 44) * fade_in_step, - (i + 43) * fade_in_step, (i + 42) * fade_in_step, (i + 41) * fade_in_step, - (i + 40) * fade_in_step, - (i + 39) * fade_in_step, (i + 38) * fade_in_step, (i + 37) * fade_in_step, - (i + 36) * fade_in_step, - (i + 35) * fade_in_step, (i + 34) * fade_in_step, (i + 33) * fade_in_step, - (i + 32) * fade_in_step); - auto gain3 = _mm512_set_ps((i + 63) * fade_in_step, (i + 62) * fade_in_step, (i + 61) * fade_in_step, - (i + 60) * fade_in_step, - (i + 59) * fade_in_step, (i + 58) * fade_in_step, (i + 57) * fade_in_step, - (i + 56) * fade_in_step, - (i + 55) * fade_in_step, (i + 54) 
* fade_in_step, (i + 53) * fade_in_step, - (i + 52) * fade_in_step, - (i + 51) * fade_in_step, (i + 50) * fade_in_step, (i + 49) * fade_in_step, - (i + 48) * fade_in_step); - - // 加载音频样本 - auto a0 = _mm512_load_ps(&src[i]); - auto a1 = _mm512_load_ps(&src[i + 16]); - auto a2 = _mm512_load_ps(&src[i + 32]); - auto a3 = _mm512_load_ps(&src[i + 48]); - - // 应用淡入增益 - auto result0 = _mm512_mul_ps(a0, gain0); - auto result1 = _mm512_mul_ps(a1, gain1); - auto result2 = _mm512_mul_ps(a2, gain2); - auto result3 = _mm512_mul_ps(a3, gain3); - - // 存储结果 - _mm512_store_ps(&dst[i], result0); - _mm512_store_ps(&dst[i + 16], result1); - _mm512_store_ps(&dst[i + 32], result2); - _mm512_store_ps(&dst[i + 48], result3); - } - - // 处理剩余的淡入样本(标量处理) - for (; i < std::min(fade_in_samples, num_samples); ++i) { - const float gain = static_cast(i) / static_cast(fade_in_samples); - dst[i] = src[i] * gain; - } - } - - // 处理中间部分(无淡入淡出,直接复制) - const size_t middle_start = fade_in_samples; - const size_t middle_end = num_samples > fade_out_samples ? 
num_samples - fade_out_samples : 0; - - if (middle_end > middle_start) { - // 使用AVX512优化的直接复制 - for (size_t j = middle_start; j + simd_width * unroll_factor <= middle_end; j += simd_width * - unroll_factor) { - auto a0 = _mm512_load_ps(&src[j]); - auto a1 = _mm512_load_ps(&src[j + 16]); - auto a2 = _mm512_load_ps(&src[j + 32]); - auto a3 = _mm512_load_ps(&src[j + 48]); - - _mm512_store_ps(&dst[j], a0); - _mm512_store_ps(&dst[j + 16], a1); - _mm512_store_ps(&dst[j + 32], a2); - _mm512_store_ps(&dst[j + 48], a3); - } - - // 处理剩余的中间样本(标量处理) - for (size_t j = middle_start + ((middle_end - middle_start) / (simd_width * unroll_factor)) * (simd_width * - unroll_factor); j < middle_end; ++j) { - dst[j] = src[j]; - } - } - - // 处理淡出部分 - if (fade_out_samples > 0 && num_samples > fade_out_samples) { - const size_t fade_out_start = num_samples - fade_out_samples; - const float fade_out_step = 1.0f / static_cast(fade_out_samples); - - // 向量化处理淡出(4路循环展开) - for (size_t j = fade_out_start; j + simd_width * unroll_factor <= num_samples; j += simd_width * - unroll_factor) { - // 计算当前样本的淡出系数(从1递减到0) - const size_t fade_out_offset = j - fade_out_start; - auto gain0 = _mm512_set_ps(1.0f - (fade_out_offset + 15) * fade_out_step, - 1.0f - (fade_out_offset + 14) * fade_out_step, - 1.0f - (fade_out_offset + 13) * fade_out_step, - 1.0f - (fade_out_offset + 12) * fade_out_step, - 1.0f - (fade_out_offset + 11) * fade_out_step, - 1.0f - (fade_out_offset + 10) * fade_out_step, - 1.0f - (fade_out_offset + 9) * fade_out_step, - 1.0f - (fade_out_offset + 8) * fade_out_step, - 1.0f - (fade_out_offset + 7) * fade_out_step, - 1.0f - (fade_out_offset + 6) * fade_out_step, - 1.0f - (fade_out_offset + 5) * fade_out_step, - 1.0f - (fade_out_offset + 4) * fade_out_step, - 1.0f - (fade_out_offset + 3) * fade_out_step, - 1.0f - (fade_out_offset + 2) * fade_out_step, - 1.0f - (fade_out_offset + 1) * fade_out_step, - 1.0f - fade_out_offset * fade_out_step); - auto gain1 = _mm512_set_ps(1.0f - (fade_out_offset 
+ 31) * fade_out_step, - 1.0f - (fade_out_offset + 30) * fade_out_step, - 1.0f - (fade_out_offset + 29) * fade_out_step, - 1.0f - (fade_out_offset + 28) * fade_out_step, - 1.0f - (fade_out_offset + 27) * fade_out_step, - 1.0f - (fade_out_offset + 26) * fade_out_step, - 1.0f - (fade_out_offset + 25) * fade_out_step, - 1.0f - (fade_out_offset + 24) * fade_out_step, - 1.0f - (fade_out_offset + 23) * fade_out_step, - 1.0f - (fade_out_offset + 22) * fade_out_step, - 1.0f - (fade_out_offset + 21) * fade_out_step, - 1.0f - (fade_out_offset + 20) * fade_out_step, - 1.0f - (fade_out_offset + 19) * fade_out_step, - 1.0f - (fade_out_offset + 18) * fade_out_step, - 1.0f - (fade_out_offset + 17) * fade_out_step, - 1.0f - (fade_out_offset + 16) * fade_out_step); - auto gain2 = _mm512_set_ps(1.0f - (fade_out_offset + 47) * fade_out_step, - 1.0f - (fade_out_offset + 46) * fade_out_step, - 1.0f - (fade_out_offset + 45) * fade_out_step, - 1.0f - (fade_out_offset + 44) * fade_out_step, - 1.0f - (fade_out_offset + 43) * fade_out_step, - 1.0f - (fade_out_offset + 42) * fade_out_step, - 1.0f - (fade_out_offset + 41) * fade_out_step, - 1.0f - (fade_out_offset + 40) * fade_out_step, - 1.0f - (fade_out_offset + 39) * fade_out_step, - 1.0f - (fade_out_offset + 38) * fade_out_step, - 1.0f - (fade_out_offset + 37) * fade_out_step, - 1.0f - (fade_out_offset + 36) * fade_out_step, - 1.0f - (fade_out_offset + 35) * fade_out_step, - 1.0f - (fade_out_offset + 34) * fade_out_step, - 1.0f - (fade_out_offset + 33) * fade_out_step, - 1.0f - (fade_out_offset + 32) * fade_out_step); - auto gain3 = _mm512_set_ps(1.0f - (fade_out_offset + 63) * fade_out_step, - 1.0f - (fade_out_offset + 62) * fade_out_step, - 1.0f - (fade_out_offset + 61) * fade_out_step, - 1.0f - (fade_out_offset + 60) * fade_out_step, - 1.0f - (fade_out_offset + 59) * fade_out_step, - 1.0f - (fade_out_offset + 58) * fade_out_step, - 1.0f - (fade_out_offset + 57) * fade_out_step, - 1.0f - (fade_out_offset + 56) * fade_out_step, - 1.0f - 
(fade_out_offset + 55) * fade_out_step, - 1.0f - (fade_out_offset + 54) * fade_out_step, - 1.0f - (fade_out_offset + 53) * fade_out_step, - 1.0f - (fade_out_offset + 52) * fade_out_step, - 1.0f - (fade_out_offset + 51) * fade_out_step, - 1.0f - (fade_out_offset + 50) * fade_out_step, - 1.0f - (fade_out_offset + 49) * fade_out_step, - 1.0f - (fade_out_offset + 48) * fade_out_step); - - // 加载音频样本 - auto a0 = _mm512_load_ps(&src[j]); - auto a1 = _mm512_load_ps(&src[j + 16]); - auto a2 = _mm512_load_ps(&src[j + 32]); - auto a3 = _mm512_load_ps(&src[j + 48]); - - // 应用淡出增益 - auto result0 = _mm512_mul_ps(a0, gain0); - auto result1 = _mm512_mul_ps(a1, gain1); - auto result2 = _mm512_mul_ps(a2, gain2); - auto result3 = _mm512_mul_ps(a3, gain3); - - // 存储结果 - _mm512_store_ps(&dst[j], result0); - _mm512_store_ps(&dst[j + 16], result1); - _mm512_store_ps(&dst[j + 32], result2); - _mm512_store_ps(&dst[j + 48], result3); - } - - // 处理剩余的淡出样本(标量处理) - for (size_t j = fade_out_start + ((fade_out_samples / (simd_width * unroll_factor)) * (simd_width * - unroll_factor)); j < num_samples; ++j) { - const size_t fade_out_offset = j - fade_out_start; - const float gain = 1.0f - static_cast(fade_out_offset) / static_cast(fade_out_samples); - dst[j] = src[j] * gain; - } - } - } - - - // 简单均衡器函数实现 (SSE版本) - void simple_eq_sse(const float* src, float* dst, float low_gain, float mid_gain, float high_gain, float* eq_state, - size_t num_samples) { - ASSERT_ALIGNED(src, ALIGNMENT_SSE); - ASSERT_ALIGNED(dst, ALIGNMENT_SSE); - - // 边界情况处理 - if (num_samples == 0) { - return; - } - - constexpr size_t simd_width = 4; // SSE每次处理4个float - constexpr size_t unroll_factor = 4; // 4路循环展开提高指令级并行性 - - // 简化的频率分割系数 - // 低频cutoff约为500Hz,高频cutoff约为5000Hz (假设44.1kHz采样率) - constexpr float low_cutoff = 0.02f; // 简化的低通滤波器系数 - constexpr float high_cutoff = 0.1f; // 简化的高通滤波器系数 - constexpr float mid_factor = 0.7f; // 中频保持系数 - - // 初始化EQ状态 - float low_state = eq_state != nullptr ? 
eq_state[0] : 0.0f; - float high_state = eq_state != nullptr ? eq_state[1] : 0.0f; - - // 创建增益向量 - const auto low_gain_vec = _mm_set1_ps(low_gain); - const auto mid_gain_vec = _mm_set1_ps(mid_gain); - const auto high_gain_vec = _mm_set1_ps(high_gain); - const auto low_cutoff_vec = _mm_set1_ps(low_cutoff); - const auto high_cutoff_vec = _mm_set1_ps(high_cutoff); - const auto mid_factor_vec = _mm_set1_ps(mid_factor); - const auto one_minus_low_cutoff_vec = _mm_set1_ps(1.0f - low_cutoff); - const auto one_minus_high_cutoff_vec = _mm_set1_ps(1.0f - high_cutoff); - - size_t i = 0; - - // 向量化处理(4路循环展开) - for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) { - // 加载4个向量 - auto input0 = _mm_load_ps(&src[i]); - auto input1 = _mm_load_ps(&src[i + 4]); - auto input2 = _mm_load_ps(&src[i + 8]); - auto input3 = _mm_load_ps(&src[i + 12]); - - // 简化的低通滤波器实现(一阶IIR) - // low_output = low_cutoff * input + (1 - low_cutoff) * low_state - auto low_state_vec = _mm_set1_ps(low_state); - auto low0 = _mm_add_ps(_mm_mul_ps(input0, low_cutoff_vec), - _mm_mul_ps(low_state_vec, one_minus_low_cutoff_vec)); - auto low1 = _mm_add_ps(_mm_mul_ps(input1, low_cutoff_vec), _mm_mul_ps(low0, one_minus_low_cutoff_vec)); - auto low2 = _mm_add_ps(_mm_mul_ps(input2, low_cutoff_vec), _mm_mul_ps(low1, one_minus_low_cutoff_vec)); - auto low3 = _mm_add_ps(_mm_mul_ps(input3, low_cutoff_vec), _mm_mul_ps(low2, one_minus_low_cutoff_vec)); - - // 简化的高通滤波器实现(输入减去低通) - // high_output = input - low_output - auto high0 = _mm_sub_ps(input0, low0); - auto high1 = _mm_sub_ps(input1, low1); - auto high2 = _mm_sub_ps(input2, low2); - auto high3 = _mm_sub_ps(input3, low3); - - // 进一步高频处理 - auto high_state_vec = _mm_set1_ps(high_state); - high0 = _mm_add_ps(_mm_mul_ps(high0, high_cutoff_vec), - _mm_mul_ps(high_state_vec, one_minus_high_cutoff_vec)); - high1 = _mm_add_ps(_mm_mul_ps(high1, high_cutoff_vec), _mm_mul_ps(high0, one_minus_high_cutoff_vec)); - high2 = _mm_add_ps(_mm_mul_ps(high2, 
high_cutoff_vec), _mm_mul_ps(high1, one_minus_high_cutoff_vec)); - high3 = _mm_add_ps(_mm_mul_ps(high3, high_cutoff_vec), _mm_mul_ps(high2, one_minus_high_cutoff_vec)); - - // 中频:原始信号减去低频和高频 - auto mid0 = _mm_mul_ps(_mm_sub_ps(_mm_sub_ps(input0, low0), high0), mid_factor_vec); - auto mid1 = _mm_mul_ps(_mm_sub_ps(_mm_sub_ps(input1, low1), high1), mid_factor_vec); - auto mid2 = _mm_mul_ps(_mm_sub_ps(_mm_sub_ps(input2, low2), high2), mid_factor_vec); - auto mid3 = _mm_mul_ps(_mm_sub_ps(_mm_sub_ps(input3, low3), high3), mid_factor_vec); - - // 应用增益并混合 - auto result0 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(low0, low_gain_vec), _mm_mul_ps(mid0, mid_gain_vec)), - _mm_mul_ps(high0, high_gain_vec)); - auto result1 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(low1, low_gain_vec), _mm_mul_ps(mid1, mid_gain_vec)), - _mm_mul_ps(high1, high_gain_vec)); - auto result2 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(low2, low_gain_vec), _mm_mul_ps(mid2, mid_gain_vec)), - _mm_mul_ps(high2, high_gain_vec)); - auto result3 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(low3, low_gain_vec), _mm_mul_ps(mid3, mid_gain_vec)), - _mm_mul_ps(high3, high_gain_vec)); - - // 存储结果 - _mm_store_ps(&dst[i], result0); - _mm_store_ps(&dst[i + 4], result1); - _mm_store_ps(&dst[i + 8], result2); - _mm_store_ps(&dst[i + 12], result3); - - // 更新状态(使用最后一个元素) - low_state = _mm_cvtss_f32(_mm_shuffle_ps(low3, low3, _MM_SHUFFLE(3, 3, 3, 3))); - high_state = _mm_cvtss_f32(_mm_shuffle_ps(high3, high3, _MM_SHUFFLE(3, 3, 3, 3))); - } - - // 处理剩余的标量样本 - for (; i < num_samples; ++i) { - float input = src[i]; - - // 低通滤波器 - float low_output = low_cutoff * input + (1.0f - low_cutoff) * low_state; - low_state = low_output; - - // 高通滤波器 - float high_input = input - low_output; - float high_output = high_cutoff * high_input + (1.0f - high_cutoff) * high_state; - high_state = high_output; - - // 中频 - float mid_output = (input - low_output - high_output) * mid_factor; - - // 混合并应用增益 - dst[i] = low_output * low_gain + mid_output * mid_gain + high_output * 
high_gain; - } - - // 更新EQ状态 - if (eq_state != nullptr) { - eq_state[0] = low_state; - eq_state[1] = high_state; - } - } - - // 简单均衡器函数实现 (AVX版本) - void simple_eq_avx(const float* src, float* dst, float low_gain, float mid_gain, float high_gain, float* eq_state, - size_t num_samples) { - ASSERT_ALIGNED(src, ALIGNMENT_AVX); - ASSERT_ALIGNED(dst, ALIGNMENT_AVX); - - // 边界情况处理 - if (num_samples == 0) { - return; - } - - constexpr size_t simd_width = 8; // AVX每次处理8个float - constexpr size_t unroll_factor = 4; // 4路循环展开提高指令级并行性 - - // 简化的频率分割系数 - constexpr float low_cutoff = 0.02f; // 简化的低通滤波器系数 - constexpr float high_cutoff = 0.1f; // 简化的高通滤波器系数 - constexpr float mid_factor = 0.7f; // 中频保持系数 - - // 初始化EQ状态 - float low_state = eq_state != nullptr ? eq_state[0] : 0.0f; - float high_state = eq_state != nullptr ? eq_state[1] : 0.0f; - - // 创建增益向量 - const auto low_gain_vec = _mm256_set1_ps(low_gain); - const auto mid_gain_vec = _mm256_set1_ps(mid_gain); - const auto high_gain_vec = _mm256_set1_ps(high_gain); - const auto low_cutoff_vec = _mm256_set1_ps(low_cutoff); - const auto high_cutoff_vec = _mm256_set1_ps(high_cutoff); - const auto mid_factor_vec = _mm256_set1_ps(mid_factor); - const auto one_minus_low_cutoff_vec = _mm256_set1_ps(1.0f - low_cutoff); - const auto one_minus_high_cutoff_vec = _mm256_set1_ps(1.0f - high_cutoff); - - size_t i = 0; - - // 向量化处理(4路循环展开) - for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) { - // 加载4个向量 - auto input0 = _mm256_load_ps(&src[i]); - auto input1 = _mm256_load_ps(&src[i + 8]); - auto input2 = _mm256_load_ps(&src[i + 16]); - auto input3 = _mm256_load_ps(&src[i + 24]); - - // 简化的低通滤波器实现(一阶IIR) - auto low_state_vec = _mm256_set1_ps(low_state); - auto low0 = _mm256_add_ps(_mm256_mul_ps(input0, low_cutoff_vec), - _mm256_mul_ps(low_state_vec, one_minus_low_cutoff_vec)); - auto low1 = _mm256_add_ps(_mm256_mul_ps(input1, low_cutoff_vec), - _mm256_mul_ps(low0, one_minus_low_cutoff_vec)); - auto low2 = 
_mm256_add_ps(_mm256_mul_ps(input2, low_cutoff_vec), - _mm256_mul_ps(low1, one_minus_low_cutoff_vec)); - auto low3 = _mm256_add_ps(_mm256_mul_ps(input3, low_cutoff_vec), - _mm256_mul_ps(low2, one_minus_low_cutoff_vec)); - - // 简化的高通滤波器实现 - auto high0 = _mm256_sub_ps(input0, low0); - auto high1 = _mm256_sub_ps(input1, low1); - auto high2 = _mm256_sub_ps(input2, low2); - auto high3 = _mm256_sub_ps(input3, low3); - - // 进一步高频处理 - auto high_state_vec = _mm256_set1_ps(high_state); - high0 = _mm256_add_ps(_mm256_mul_ps(high0, high_cutoff_vec), - _mm256_mul_ps(high_state_vec, one_minus_high_cutoff_vec)); - high1 = _mm256_add_ps(_mm256_mul_ps(high1, high_cutoff_vec), - _mm256_mul_ps(high0, one_minus_high_cutoff_vec)); - high2 = _mm256_add_ps(_mm256_mul_ps(high2, high_cutoff_vec), - _mm256_mul_ps(high1, one_minus_high_cutoff_vec)); - high3 = _mm256_add_ps(_mm256_mul_ps(high3, high_cutoff_vec), - _mm256_mul_ps(high2, one_minus_high_cutoff_vec)); - - // 中频:原始信号减去低频和高频 - auto mid0 = _mm256_mul_ps(_mm256_sub_ps(_mm256_sub_ps(input0, low0), high0), mid_factor_vec); - auto mid1 = _mm256_mul_ps(_mm256_sub_ps(_mm256_sub_ps(input1, low1), high1), mid_factor_vec); - auto mid2 = _mm256_mul_ps(_mm256_sub_ps(_mm256_sub_ps(input2, low2), high2), mid_factor_vec); - auto mid3 = _mm256_mul_ps(_mm256_sub_ps(_mm256_sub_ps(input3, low3), high3), mid_factor_vec); - - // 应用增益并混合 - auto result0 = _mm256_add_ps( - _mm256_add_ps(_mm256_mul_ps(low0, low_gain_vec), _mm256_mul_ps(mid0, mid_gain_vec)), - _mm256_mul_ps(high0, high_gain_vec)); - auto result1 = _mm256_add_ps( - _mm256_add_ps(_mm256_mul_ps(low1, low_gain_vec), _mm256_mul_ps(mid1, mid_gain_vec)), - _mm256_mul_ps(high1, high_gain_vec)); - auto result2 = _mm256_add_ps( - _mm256_add_ps(_mm256_mul_ps(low2, low_gain_vec), _mm256_mul_ps(mid2, mid_gain_vec)), - _mm256_mul_ps(high2, high_gain_vec)); - auto result3 = _mm256_add_ps( - _mm256_add_ps(_mm256_mul_ps(low3, low_gain_vec), _mm256_mul_ps(mid3, mid_gain_vec)), - _mm256_mul_ps(high3, 
high_gain_vec)); - - // 存储结果 - _mm256_store_ps(&dst[i], result0); - _mm256_store_ps(&dst[i + 8], result1); - _mm256_store_ps(&dst[i + 16], result2); - _mm256_store_ps(&dst[i + 24], result3); - - // 更新状态(使用最后一个元素) - auto low_temp = _mm256_extractf128_ps(low3, 1); - low_state = _mm_cvtss_f32(_mm_shuffle_ps(low_temp, low_temp, _MM_SHUFFLE(3, 3, 3, 3))); - auto high_temp = _mm256_extractf128_ps(high3, 1); - high_state = _mm_cvtss_f32(_mm_shuffle_ps(high_temp, high_temp, _MM_SHUFFLE(3, 3, 3, 3))); - } - - // 处理剩余的标量样本 - for (; i < num_samples; ++i) { - float input = src[i]; - - // 低通滤波器 - float low_output = low_cutoff * input + (1.0f - low_cutoff) * low_state; - low_state = low_output; - - // 高通滤波器 - float high_input = input - low_output; - float high_output = high_cutoff * high_input + (1.0f - high_cutoff) * high_state; - high_state = high_output; - - // 中频 - float mid_output = (input - low_output - high_output) * mid_factor; - - // 混合并应用增益 - dst[i] = low_output * low_gain + mid_output * mid_gain + high_output * high_gain; - } - - // 更新EQ状态 - if (eq_state != nullptr) { - eq_state[0] = low_state; - eq_state[1] = high_state; - } - } - - // 简单均衡器函数实现 (AVX512版本) - void simple_eq_avx512(const float* src, float* dst, float low_gain, float mid_gain, float high_gain, - float* eq_state, size_t num_samples) { - ASSERT_ALIGNED(src, ALIGNMENT_AVX512); - ASSERT_ALIGNED(dst, ALIGNMENT_AVX512); - - // 边界情况处理 - if (num_samples == 0) { - return; - } - - constexpr size_t simd_width = 16; // AVX512每次处理16个float - constexpr size_t unroll_factor = 4; // 4路循环展开提高指令级并行性 - - // 简化的频率分割系数 - constexpr float low_cutoff = 0.02f; // 简化的低通滤波器系数 - constexpr float high_cutoff = 0.1f; // 简化的高通滤波器系数 - constexpr float mid_factor = 0.7f; // 中频保持系数 - - // 初始化EQ状态 - float low_state = eq_state != nullptr ? eq_state[0] : 0.0f; - float high_state = eq_state != nullptr ? 
eq_state[1] : 0.0f; - - // 创建增益向量 - const auto low_gain_vec = _mm512_set1_ps(low_gain); - const auto mid_gain_vec = _mm512_set1_ps(mid_gain); - const auto high_gain_vec = _mm512_set1_ps(high_gain); - const auto low_cutoff_vec = _mm512_set1_ps(low_cutoff); - const auto high_cutoff_vec = _mm512_set1_ps(high_cutoff); - const auto mid_factor_vec = _mm512_set1_ps(mid_factor); - const auto one_minus_low_cutoff_vec = _mm512_set1_ps(1.0f - low_cutoff); - const auto one_minus_high_cutoff_vec = _mm512_set1_ps(1.0f - high_cutoff); - - size_t i = 0; - - // 向量化处理(4路循环展开) - for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) { - // 加载4个向量 - auto input0 = _mm512_load_ps(&src[i]); - auto input1 = _mm512_load_ps(&src[i + 16]); - auto input2 = _mm512_load_ps(&src[i + 32]); - auto input3 = _mm512_load_ps(&src[i + 48]); - - // 简化的低通滤波器实现(一阶IIR) - auto low_state_vec = _mm512_set1_ps(low_state); - auto low0 = _mm512_fmadd_ps(input0, low_cutoff_vec, _mm512_mul_ps(low_state_vec, one_minus_low_cutoff_vec)); - auto low1 = _mm512_fmadd_ps(input1, low_cutoff_vec, _mm512_mul_ps(low0, one_minus_low_cutoff_vec)); - auto low2 = _mm512_fmadd_ps(input2, low_cutoff_vec, _mm512_mul_ps(low1, one_minus_low_cutoff_vec)); - auto low3 = _mm512_fmadd_ps(input3, low_cutoff_vec, _mm512_mul_ps(low2, one_minus_low_cutoff_vec)); - - // 简化的高通滤波器实现 - auto high0 = _mm512_sub_ps(input0, low0); - auto high1 = _mm512_sub_ps(input1, low1); - auto high2 = _mm512_sub_ps(input2, low2); - auto high3 = _mm512_sub_ps(input3, low3); - - // 进一步高频处理 - auto high_state_vec = _mm512_set1_ps(high_state); - high0 = _mm512_fmadd_ps(high0, high_cutoff_vec, _mm512_mul_ps(high_state_vec, one_minus_high_cutoff_vec)); - high1 = _mm512_fmadd_ps(high1, high_cutoff_vec, _mm512_mul_ps(high0, one_minus_high_cutoff_vec)); - high2 = _mm512_fmadd_ps(high2, high_cutoff_vec, _mm512_mul_ps(high1, one_minus_high_cutoff_vec)); - high3 = _mm512_fmadd_ps(high3, high_cutoff_vec, _mm512_mul_ps(high2, 
one_minus_high_cutoff_vec)); - - // 中频:原始信号减去低频和高频 - auto mid0 = _mm512_mul_ps(_mm512_sub_ps(_mm512_sub_ps(input0, low0), high0), mid_factor_vec); - auto mid1 = _mm512_mul_ps(_mm512_sub_ps(_mm512_sub_ps(input1, low1), high1), mid_factor_vec); - auto mid2 = _mm512_mul_ps(_mm512_sub_ps(_mm512_sub_ps(input2, low2), high2), mid_factor_vec); - auto mid3 = _mm512_mul_ps(_mm512_sub_ps(_mm512_sub_ps(input3, low3), high3), mid_factor_vec); - - // 应用增益并混合 (使用FMA指令优化) - auto result0 = _mm512_fmadd_ps(low0, low_gain_vec, - _mm512_fmadd_ps(mid0, mid_gain_vec, _mm512_mul_ps(high0, high_gain_vec))); - auto result1 = _mm512_fmadd_ps(low1, low_gain_vec, - _mm512_fmadd_ps(mid1, mid_gain_vec, _mm512_mul_ps(high1, high_gain_vec))); - auto result2 = _mm512_fmadd_ps(low2, low_gain_vec, - _mm512_fmadd_ps(mid2, mid_gain_vec, _mm512_mul_ps(high2, high_gain_vec))); - auto result3 = _mm512_fmadd_ps(low3, low_gain_vec, - _mm512_fmadd_ps(mid3, mid_gain_vec, _mm512_mul_ps(high3, high_gain_vec))); - - // 存储结果 - _mm512_store_ps(&dst[i], result0); - _mm512_store_ps(&dst[i + 16], result1); - _mm512_store_ps(&dst[i + 32], result2); - _mm512_store_ps(&dst[i + 48], result3); - - // 更新状态(使用最后一个元素) - __m128 low_temp = _mm512_extractf32x4_ps(low3, 3); - low_state = _mm_cvtss_f32(low_temp); - __m128 high_temp = _mm512_extractf32x4_ps(high3, 3); - high_state = _mm_cvtss_f32(high_temp); - } - - // 处理剩余的标量样本 - for (; i < num_samples; ++i) { - float input = src[i]; - - // 低通滤波器 - float low_output = low_cutoff * input + (1.0f - low_cutoff) * low_state; - low_state = low_output; - - // 高通滤波器 - float high_input = input - low_output; - float high_output = high_cutoff * high_input + (1.0f - high_cutoff) * high_state; - high_state = high_output; - - // 中频 - float mid_output = (input - low_output - high_output) * mid_factor; - - // 混合并应用增益 - dst[i] = low_output * low_gain + mid_output * mid_gain + high_output * high_gain; - } - - // 更新EQ状态 - if (eq_state != nullptr) { - eq_state[0] = low_state; - eq_state[1] = 
high_state; - } - } -} -#endif diff --git a/src/simd/misc/CMakeLists.txt b/src/simd/misc/CMakeLists.txt new file mode 100644 index 0000000..fb107ac --- /dev/null +++ b/src/simd/misc/CMakeLists.txt @@ -0,0 +1,8 @@ +project(alicho_simd) + +simple_library(STATIC) +target_link_libraries(${PROJECT_NAME} PUBLIC alicho_misc alicho_simd_interface) + +if(UNIX AND NOT APPLE) + target_link_libraries(${PROJECT_NAME} PUBLIC dl) +endif() diff --git a/src/simd/audio_processing/arm_simd_audio_processing_func.cpp b/src/simd/misc/audio_processing/arm_simd_audio_processing_func.cpp similarity index 100% rename from src/simd/audio_processing/arm_simd_audio_processing_func.cpp rename to src/simd/misc/audio_processing/arm_simd_audio_processing_func.cpp diff --git a/src/simd/audio_processing/arm_simd_audio_processing_func.h b/src/simd/misc/audio_processing/arm_simd_audio_processing_func.h similarity index 100% rename from src/simd/audio_processing/arm_simd_audio_processing_func.h rename to src/simd/misc/audio_processing/arm_simd_audio_processing_func.h diff --git a/src/simd/audio_processing/scalar_audio_processing_func.cpp b/src/simd/misc/audio_processing/scalar_audio_processing_func.cpp similarity index 100% rename from src/simd/audio_processing/scalar_audio_processing_func.cpp rename to src/simd/misc/audio_processing/scalar_audio_processing_func.cpp diff --git a/src/simd/audio_processing/scalar_audio_processing_func.h b/src/simd/misc/audio_processing/scalar_audio_processing_func.h similarity index 99% rename from src/simd/audio_processing/scalar_audio_processing_func.h rename to src/simd/misc/audio_processing/scalar_audio_processing_func.h index 7894911..6765fe4 100644 --- a/src/simd/audio_processing/scalar_audio_processing_func.h +++ b/src/simd/misc/audio_processing/scalar_audio_processing_func.h @@ -31,6 +31,7 @@ #pragma once #include #include +#include /** * @namespace scalar_audio_processing_func diff --git a/src/simd/audio_processing/x86_simd_audio_processing_func.h 
b/src/simd/misc/audio_processing/x86_simd_audio_processing_func.h similarity index 99% rename from src/simd/audio_processing/x86_simd_audio_processing_func.h rename to src/simd/misc/audio_processing/x86_simd_audio_processing_func.h index 93e4751..b3e8b84 100644 --- a/src/simd/audio_processing/x86_simd_audio_processing_func.h +++ b/src/simd/misc/audio_processing/x86_simd_audio_processing_func.h @@ -87,6 +87,7 @@ */ #pragma once +#include #if ALICHO_PLATFORM_X86 /** diff --git a/src/simd/cpu_features.cpp b/src/simd/misc/cpu_features.cpp similarity index 99% rename from src/simd/cpu_features.cpp rename to src/simd/misc/cpu_features.cpp index 9aa1351..b20974c 100644 --- a/src/simd/cpu_features.cpp +++ b/src/simd/misc/cpu_features.cpp @@ -293,9 +293,9 @@ void cpu_feature_detector::detect_x86_features() { if ((cpuid_7.ebx & (1 << 30)) != 0) { info_.features |= static_cast(cpu_feature::AVX512BW); } // bit 30: AVX-512 Byte/Word(字节/字操作) if ((cpuid_7.ebx & (1 << 17)) != 0) { info_.features |= static_cast(cpu_feature::AVX512DQ); } // bit 17: AVX-512 DQ(双字/四字操作) - // 从CPUID.7.0.ECX寄存提取更多AVX-512扩展特性 + // 从CPUID.7.0.ECX寄存器提取更多AVX-512扩展特性 if ((cpuid_7.ecx & (1 << 21)) != 0) { info_.features |= static_cast(cpu_feature::AVX512IFMA); } // bit 21: AVX-512 IFMA(整数融合乘加) - if ((cpuid_7.ecx & (1 << 2)) != 0) { info_.features |= static_cast(cpu_feature::AVX512VBMI); } // bit 1: AVX-512 VBMI(向量字节操作) + if ((cpuid_7.ecx & (1 << 1)) != 0) { info_.features |= static_cast(cpu_feature::AVX512VBMI); } // bit 1: AVX-512 VBMI(向量字节操作) // ======================================================================== // 步骤4: 确定最高可用的SIMD级别 diff --git a/src/simd/cpu_features.h b/src/simd/misc/cpu_features.h similarity index 100% rename from src/simd/cpu_features.h rename to src/simd/misc/cpu_features.h diff --git a/src/simd/misc/simd_api.h b/src/simd/misc/simd_api.h new file mode 100644 index 0000000..6877a5b --- /dev/null +++ b/src/simd/misc/simd_api.h @@ -0,0 +1,95 @@ +#pragma once + +#include 
"simd_func_dispatcher.h" + +namespace simd { + +// ============================================================================ +// SIMD函数的零开销包装接口 +// +// 这些inline函数会被编译器优化为直接的函数指针调用, +// 实现零开销的SIMD函数调度 +// ============================================================================ + +inline void fill_buffer(float* buffer, float value, size_t num_samples) { + simd_func_dispatcher::instance().get_fill_buffer()(buffer, value, num_samples); +} + +inline void mix_audio(const float* src1, const float* src2, + float* dst, size_t num_samples) { + simd_func_dispatcher::instance().get_mix_audio()(src1, src2, dst, num_samples); +} + +inline void apply_gain(const float* src, float* dst, + float gain, size_t num_samples) { + simd_func_dispatcher::instance().get_apply_gain()(src, dst, gain, num_samples); +} + +inline float calculate_rms(const float* src, size_t num_samples) { + return simd_func_dispatcher::instance().get_calculate_rms()(src, num_samples); +} + +inline float calculate_peak(const float* src, size_t num_samples) { + return simd_func_dispatcher::instance().get_calculate_peak()(src, num_samples); +} + +inline void normalize_audio(const float* src, float* dst, + float target_peak, size_t num_samples) { + simd_func_dispatcher::instance().get_normalize_audio()( + src, dst, target_peak, num_samples); +} + +inline void stereo_to_mono(const float* stereo_src, float* mono_dst, + size_t num_stereo_samples) { + simd_func_dispatcher::instance().get_stereo_to_mono()( + stereo_src, mono_dst, num_stereo_samples); +} + +inline void limit_audio(const float* src, float* dst, float threshold, + float* limiter_state, float sample_rate, size_t num_samples) { + simd_func_dispatcher::instance().get_limit_audio()( + src, dst, threshold, limiter_state, sample_rate, num_samples); +} + +inline void fade_audio(const float* src, float* dst, size_t fade_in_samples, + size_t fade_out_samples, size_t num_samples) { + simd_func_dispatcher::instance().get_fade_audio()( + src, dst, 
fade_in_samples, fade_out_samples, num_samples); +} + +inline void simple_eq(const float* src, float* dst, float low_gain, + float mid_gain, float high_gain, float* eq_state, + size_t num_samples) { + simd_func_dispatcher::instance().get_simple_eq()( + src, dst, low_gain, mid_gain, high_gain, eq_state, num_samples); +} + +// ============================================================================ +// 调试和信息接口 +// ============================================================================ + +/// 获取当前激活的SIMD版本 +/// @return 当前使用的SIMD指令集版本 +inline auto get_active_simd_version() -> simd_func_version { + return simd_func_dispatcher::instance().get_active_version(); +} + +/// 获取当前激活的SIMD版本的字符串表示 +/// @return 版本名称(如 "AVX2", "NEON", "SCALAR" 等) +inline auto get_active_simd_version_string() -> std::string { + auto version = get_active_simd_version(); + switch (version) { + case simd_func_version::SCALAR: return "SCALAR"; + case simd_func_version::SSE: return "SSE"; + case simd_func_version::SSE3: return "SSE3"; + case simd_func_version::SSE4: return "SSE4"; + case simd_func_version::AVX: return "AVX"; + case simd_func_version::AVX2: return "AVX2"; + case simd_func_version::AVX512: return "AVX512"; + case simd_func_version::NEON: return "NEON"; + case simd_func_version::NEON_FP16: return "NEON_FP16"; + default: return "UNKNOWN"; + } +} + +} // namespace simd \ No newline at end of file diff --git a/src/simd/misc/simd_func_dispatcher.cpp b/src/simd/misc/simd_func_dispatcher.cpp new file mode 100644 index 0000000..9528f01 --- /dev/null +++ b/src/simd/misc/simd_func_dispatcher.cpp @@ -0,0 +1,201 @@ +#include "simd_func_dispatcher.h" +#include "cpu_features.h" +#include "logger.h" + +#include + +#define SIMD_FUNC_DISPATCHER_LOG_MODULE "simd_func_dispatcher" + +// 将 simd_level 映射到 simd_func_version +static std::string_view version_to_lib_name(simd_func_version version) { + switch (version) { + case simd_func_version::SCALAR: return "scaler"; + case simd_func_version::SSE: 
return "sse"; + case simd_func_version::AVX: return "avx"; + case simd_func_version::AVX2: return "avx2"; + case simd_func_version::AVX512: return "avx512"; + case simd_func_version::NEON: return "neon"; + case simd_func_version::NEON_FP16: return "neon_fp16"; + default: return ""; + } +} + +simd_func_dispatcher::simd_func_dispatcher() { + for (int i = 0; i < static_cast(simd_func_version::COUNT); ++i) { + auto version = static_cast(i); + auto lib_suffix = version_to_lib_name(version); + if (lib_suffix.empty()) { + continue; + } + + std::string lib_name; +#if ALICHO_PLATFORM_WINDOWS + lib_name = "alicho_simd_" + std::string(lib_suffix) + ".dll"; +#elif ALICHO_PLATFORM_LINUX + lib_name = "./libalicho_simd_" + std::string(lib_suffix) + ".so"; +#elif ALICHO_PLATFORM_APPLE + lib_name = "./libalicho_simd_" + std::string(lib_suffix) + ".dylib"; +#endif + + auto handle = std::make_unique(); + if (handle->open(lib_name)) { + log_module_info(SIMD_FUNC_DISPATCHER_LOG_MODULE, "Successfully loaded SIMD library: {}", lib_name); + loaded_libraries_[version] = std::move(handle); + } else { + log_module_debug(SIMD_FUNC_DISPATCHER_LOG_MODULE, "Could not find or load SIMD library: {}", lib_name); + } + } + + // 初始化函数指针 + initialize_function_pointers(); +} + +simd_func_dispatcher::~simd_func_dispatcher() = default; + +// 初始化函数指针 +void simd_func_dispatcher::initialize_function_pointers() { + // 1. 检测CPU能力 + const auto& cpu_info = get_cpu_info(); + auto recommended_level = get_recommended_simd_level(); + auto preferred_version = simd_level_to_version(recommended_level); + + log_module_info(SIMD_FUNC_DISPATCHER_LOG_MODULE, "检测到CPU最高SIMD级别: {}", static_cast(cpu_info.max_simd_level)); + log_module_info(SIMD_FUNC_DISPATCHER_LOG_MODULE, "推荐使用SIMD级别: {}", static_cast(recommended_level)); + + // 2. 找到可用版本 + auto target_version = find_fallback_version(preferred_version); + + log_module_info(SIMD_FUNC_DISPATCHER_LOG_MODULE, "选择的SIMD版本: {}", static_cast(target_version)); + + // 3. 
加载函数指针 + if (!try_load_functions(target_version)) { + throw std::runtime_error("Failed to load SIMD functions for any available version"); + } + + active_version_ = target_version; + log_module_info(SIMD_FUNC_DISPATCHER_LOG_MODULE, "成功初始化SIMD函数调度器,激活版本: {}", static_cast(active_version_)); +} + +// 回退策略 +auto simd_func_dispatcher::find_fallback_version(simd_func_version preferred) -> simd_func_version { + // 定义回退序列 + #if ALICHO_PLATFORM_X86 + // x86/x64 回退序列:从高到低 + static const simd_func_version x86_fallback[] = { + simd_func_version::AVX512, + simd_func_version::AVX2, + simd_func_version::AVX, + simd_func_version::SSE, + simd_func_version::SCALAR + }; + + // 从首选版本开始查找 + bool found_preferred = false; + for (auto version : x86_fallback) { + if (version == preferred) { + found_preferred = true; + } + + if (found_preferred && loaded_libraries_.count(version) > 0) { + log_module_info(SIMD_FUNC_DISPATCHER_LOG_MODULE, "找到可用的SIMD版本: {}", static_cast(version)); + return version; + } + } + + #elif ALICHO_PLATFORM_ARM + // ARM 回退序列 + static const simd_func_version arm_fallback[] = { + simd_func_version::NEON_FP16, + simd_func_version::NEON, + simd_func_version::SCALAR + }; + + bool found_preferred = false; + for (auto version : arm_fallback) { + if (version == preferred) { + found_preferred = true; + } + + if (found_preferred && loaded_libraries_.count(version) > 0) { + log_module_info(SIMD_FUNC_DISPATCHER_LOG_MODULE, "找到可用的SIMD版本: {}", static_cast(version)); + return version; + } + } + #endif + + // 最后回退到标量版本 + if (loaded_libraries_.count(simd_func_version::SCALAR) > 0) { + log_module_info(SIMD_FUNC_DISPATCHER_LOG_MODULE, "回退到标量版本"); + return simd_func_version::SCALAR; + } + + throw std::runtime_error("No SIMD library available, not even scalar version"); +} + +// 尝试从指定版本加载函数 +auto simd_func_dispatcher::try_load_functions(simd_func_version version) -> bool { + // 检查库是否已加载 + auto it = loaded_libraries_.find(version); + if (it == loaded_libraries_.end()) { + 
log_module_error(SIMD_FUNC_DISPATCHER_LOG_MODULE, "SIMD库版本 {} 未加载", static_cast(version)); + return false; + } + + auto* handle = it->second.get(); + + // 加载所有10个函数 + fill_buffer_ = load_function(handle, "fill_buffer"); + mix_audio_ = load_function(handle, "mix_audio"); + apply_gain_ = load_function(handle, "apply_gain"); + calculate_rms_ = load_function(handle, "calculate_rms"); + calculate_peak_ = load_function(handle, "calculate_peak"); + normalize_audio_ = load_function(handle, "normalize_audio"); + stereo_to_mono_ = load_function(handle, "stereo_to_mono"); + limit_audio_ = load_function(handle, "limit_audio"); + fade_audio_ = load_function(handle, "fade_audio"); + simple_eq_ = load_function(handle, "simple_eq"); + + // 检查是否所有函数都加载成功 + if (!fill_buffer_ || !mix_audio_ || !apply_gain_ || !calculate_rms_ || + !calculate_peak_ || !normalize_audio_ || !stereo_to_mono_ || + !limit_audio_ || !fade_audio_ || !simple_eq_) { + log_module_error(SIMD_FUNC_DISPATCHER_LOG_MODULE, "加载SIMD函数失败,版本: {}", static_cast(version)); + return false; + } + + log_module_info(SIMD_FUNC_DISPATCHER_LOG_MODULE, "成功加载所有SIMD函数,版本: {}", static_cast(version)); + return true; +} + +// 从lib_handle加载单个函数 +template +auto simd_func_dispatcher::load_function(lib_handle* handle, const std::string& name) -> FuncT { + if (!handle) { + log_module_error(SIMD_FUNC_DISPATCHER_LOG_MODULE, "lib_handle 为空,无法加载函数: {}", name); + return nullptr; + } + + // FuncT 现在已经是函数指针类型 (如 void(*)(float*, float, size_t)) + // 移除指针得到函数签名类型 (如 void(float*, float, size_t)) + using func_signature = std::remove_pointer_t; + + // 调用 lib_handle::get_function_by_name() 获取 std::function + auto std_func = handle->get_function_by_name(name); + + if (!std_func) { + log_module_error(SIMD_FUNC_DISPATCHER_LOG_MODULE, "加载函数失败: {}", name); + return nullptr; + } + + // 从 std::function 获取底层函数指针 + // std::function::target() 返回指向目标可调用对象的指针 + auto* func_ptr = std_func.template target(); + + if (!func_ptr || !*func_ptr) { + 
log_module_error(SIMD_FUNC_DISPATCHER_LOG_MODULE, "转换函数指针失败: {}", name); + return nullptr; + } + + log_module_debug(SIMD_FUNC_DISPATCHER_LOG_MODULE, "成功加载函数: {}", name); + return *func_ptr; +} diff --git a/src/simd/misc/simd_func_dispatcher.h b/src/simd/misc/simd_func_dispatcher.h new file mode 100644 index 0000000..b1a30fa --- /dev/null +++ b/src/simd/misc/simd_func_dispatcher.h @@ -0,0 +1,223 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include "lib_handle.h" + +#include "cpu_features.h" +#include "simd_interface.h" + +/** + * @enum simd_func_version + * @brief SIMD函数版本枚举 - 定义所有可能的函数实现版本 + * + * 该枚举定义了函数可以有的所有SIMD优化版本。 + * 每个版本对应一个特定的SIMD指令集级别。 + * + * 版本排序: + * - 枚举值从低到高表示性能从弱到强 + * - SCALAR是最基础的版本,所有CPU都支持 + * - COUNT用于数组大小,不是实际版本 + * + * 与simd_level的关系: + * - simd_level表示CPU的能力级别 + * - simd_func_version表示函数的实现版本 + * - 通过simd_level_to_version()进行转换 + * + * @note 不是所有函数都需要实现所有版本 + * @see simd_level, simd_level_to_version() + */ +enum class simd_func_version { + /** 标量实现 - 纯C++代码,无SIMD优化 + * - 兼容性:所有CPU + * - 性能:基准性能(1x) + * - 用途:最低保底实现、参考实现 + * - 必要性:强制要求,作为回退版本 + */ + SCALAR = 0, + + /** SSE实现 - 使用SSE/SSE2指令 + * - 兼容性:2003年后的所有x86/x64 + * - 向量宽度:128位 + * - 性能提升:约2-4倍 + */ + SSE, + + /** SSE3实现 - 使用SSE3/SSSE3指令 + * - 兼容性:2006年后的主流CPU + * - 新增功能:水平运算、复数支持 + * - 性能提升:比SSE快10-20% + */ + SSE3, + + /** SSE4实现 - 使用SSE4.1/SSE4.2指令 + * - 兼容性:2008年后的主流CPU + * - 新增功能:点积、blend、字符串处理 + * - 性能提升:比SSE3快15-30% + */ + SSE4, + + /** AVX实现 - 使用AVX指令 + * - 兼容性:2011年后的主流CPU + * - 向量宽度:256位 + * - 性能提升:约2倍SSE4性能 + */ + AVX, + + /** AVX2实现 - 使用AVX2 + FMA指令 + * - 兼容性:2013年后的主流CPU + * - 新增功能:完整256位整数运算、FMA + * - 性能提升:比AVX快50-100% + * - 推荐:当前最佳性能/兼容性平衡点 + */ + AVX2, + + /** AVX-512实现 - 使用AVX-512指令集 + * - 兼容性:2016年后的高端CPU + * - 向量宽度:512位 + * - 性能提升:约2倍AVX2性能(理论) + * - 注意:可能导致CPU降频 + */ + AVX512, + + /** NEON实现 - 使用ARM NEON指令 + * - 兼容性:所有ARMv8-A (64位ARM) + * - 向量宽度:128位 + * - 性能:与SSE4相当 + * - 应用:移动设备、Apple Silicon + */ + NEON, + + /** NEON + FP16实现 - 
使用NEON半精度浮点 + * - 兼容性:ARMv8.2-A及更新 + * - 新增:硬件FP16运算 + * - 性能:FP16运算快2倍 + * - 应用:移动端AI推理 + */ + NEON_FP16, + + /** RISC-V向量扩展实现 + * - 兼容性:支持RVV的RISC-V处理器 + * - 特点:可变向量长度 + * - 应用:嵌入式、IoT + */ + VECTOR, + + /** 版本数量标记 + * 用于数组大小定义,不是实际的函数版本 + */ + COUNT +}; + +/** + * @brief 将SIMD级别转换为函数版本 + * @param level CPU的SIMD级别 + * @return 对应的函数版本枚举值 + * + * 将cpu_feature_detector检测到的SIMD级别转换为 + * 函数调度器使用的版本标识。 + * + * 映射关系: + * - simd_level::NONE -> simd_func_version::SCALAR + * - simd_level::SSE -> simd_func_version::SSE + * - simd_level::AVX2 -> simd_func_version::AVX2 + * - 等等... + * + * @note constexpr函数,编译时求值,零运行时开销 + * @see simd_level, simd_func_version + */ +constexpr auto simd_level_to_version(simd_level level) { + switch (level) { + case simd_level::NONE: + return simd_func_version::SCALAR; + case simd_level::SSE: + return simd_func_version::SSE; + case simd_level::SSE3: + return simd_func_version::SSE; + case simd_level::SSE4: + return simd_func_version::SSE; + case simd_level::AVX: + return simd_func_version::AVX; + case simd_level::AVX2: + return simd_func_version::AVX2; + case simd_level::AVX512: + return simd_func_version::AVX512; + case simd_level::NEON: + return simd_func_version::NEON; + case simd_level::NEON_FP16: + return simd_func_version::NEON_FP16; + } + + // 默认回退到标量版本 + return simd_func_version::SCALAR; +} + +class simd_func_dispatcher : public lazy_singleton { +public: + friend class lazy_singleton; + + // 函数签名类型定义(使用 decltype 从 simd_interface.h 推导) + using fill_buffer_t = decltype(&fill_buffer); + using mix_audio_t = decltype(&mix_audio); + using apply_gain_t = decltype(&apply_gain); + using calculate_rms_t = decltype(&calculate_rms); + using calculate_peak_t = decltype(&calculate_peak); + using normalize_audio_t = decltype(&normalize_audio); + using stereo_to_mono_t = decltype(&stereo_to_mono); + using limit_audio_t = decltype(&limit_audio); + using fade_audio_t = decltype(&fade_audio); + using simple_eq_t = decltype(&simple_eq); + + // 获取函数指针的接口 + 
[[nodiscard]] auto get_fill_buffer() const noexcept -> fill_buffer_t { return fill_buffer_; } + [[nodiscard]] auto get_mix_audio() const noexcept -> mix_audio_t { return mix_audio_; } + [[nodiscard]] auto get_apply_gain() const noexcept -> apply_gain_t { return apply_gain_; } + [[nodiscard]] auto get_calculate_rms() const noexcept -> calculate_rms_t { return calculate_rms_; } + [[nodiscard]] auto get_calculate_peak() const noexcept -> calculate_peak_t { return calculate_peak_; } + [[nodiscard]] auto get_normalize_audio() const noexcept -> normalize_audio_t { return normalize_audio_; } + [[nodiscard]] auto get_stereo_to_mono() const noexcept -> stereo_to_mono_t { return stereo_to_mono_; } + [[nodiscard]] auto get_limit_audio() const noexcept -> limit_audio_t { return limit_audio_; } + [[nodiscard]] auto get_fade_audio() const noexcept -> fade_audio_t { return fade_audio_; } + [[nodiscard]] auto get_simple_eq() const noexcept -> simple_eq_t { return simple_eq_; } + + [[nodiscard]] auto get_active_version() const noexcept -> simd_func_version { return active_version_; } + +protected: + simd_func_dispatcher(); + ~simd_func_dispatcher() override; + +private: + // 初始化函数指针 + void initialize_function_pointers(); + + // 尝试从指定版本加载函数 + auto try_load_functions(simd_func_version version) -> bool; + + // 回退策略 + auto find_fallback_version(simd_func_version preferred) -> simd_func_version; + + // 从lib_handle加载单个函数 + template + auto load_function(lib_handle* handle, const std::string& name) -> FuncT; + + // 已加载的库映射 + std::unordered_map> loaded_libraries_; + + // 缓存的函数指针 + fill_buffer_t fill_buffer_ = nullptr; + mix_audio_t mix_audio_ = nullptr; + apply_gain_t apply_gain_ = nullptr; + calculate_rms_t calculate_rms_ = nullptr; + calculate_peak_t calculate_peak_ = nullptr; + normalize_audio_t normalize_audio_ = nullptr; + stereo_to_mono_t stereo_to_mono_ = nullptr; + limit_audio_t limit_audio_ = nullptr; + fade_audio_t fade_audio_ = nullptr; + simple_eq_t simple_eq_ = nullptr; + + // 
当前激活的版本 + simd_func_version active_version_ = simd_func_version::SCALAR; +}; diff --git a/src/simd/simd_avx/CMakeLists.txt b/src/simd/simd_avx/CMakeLists.txt new file mode 100644 index 0000000..8964ea8 --- /dev/null +++ b/src/simd/simd_avx/CMakeLists.txt @@ -0,0 +1,7 @@ +project(alicho_simd_avx) + +if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64") + simple_library(SHARED) + target_compile_options(${PROJECT_NAME} PRIVATE -mavx2) + target_link_libraries(${PROJECT_NAME} PUBLIC alicho_simd_interface) +endif() diff --git a/src/simd/simd_avx/simd_func.cpp b/src/simd/simd_avx/simd_func.cpp new file mode 100644 index 0000000..447d2ac --- /dev/null +++ b/src/simd/simd_avx/simd_func.cpp @@ -0,0 +1,761 @@ +/** + * @file x86_avx_audio_processing_func.cpp + * @brief x86 AVX音频处理函数实现 + */ + +#include "simd_interface.h" + +#include +#include +#include "aligned_allocator.h" + +#if ALICHO_PLATFORM_X86 + +extern "C" +{ + SIMD_EXPORT void fill_buffer(float *buffer, float value, size_t num_samples) + { + ASSERT_ALIGNED(buffer, ALIGNMENT_AVX); + + constexpr size_t simd_width = 8; + constexpr size_t unroll_factor = 4; + size_t i = 0; + + auto value_vec = _mm256_set1_ps(value); + + for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) + { + _mm256_store_ps(&buffer[i], value_vec); + _mm256_store_ps(&buffer[i + 8], value_vec); + _mm256_store_ps(&buffer[i + 16], value_vec); + _mm256_store_ps(&buffer[i + 24], value_vec); + } + + for (; i + simd_width <= num_samples; i += simd_width) + { + _mm256_store_ps(&buffer[i], value_vec); + } + + for (; i < num_samples; ++i) + { + buffer[i] = value; + } + } + + SIMD_EXPORT void mix_audio(const float *src1, const float *src2, float *dst, size_t num_samples) + { + ASSERT_ALIGNED(src1, ALIGNMENT_AVX); + ASSERT_ALIGNED(src2, ALIGNMENT_AVX); + ASSERT_ALIGNED(dst, ALIGNMENT_AVX); + + constexpr size_t simd_width = 8; + constexpr size_t unroll_factor = 4; + size_t i = 0; + + for (; i + simd_width * unroll_factor <= num_samples; 
i += simd_width * unroll_factor) + { + auto a0 = _mm256_load_ps(&src1[i]); + auto a1 = _mm256_load_ps(&src1[i + 8]); + auto a2 = _mm256_load_ps(&src1[i + 16]); + auto a3 = _mm256_load_ps(&src1[i + 24]); + + auto b0 = _mm256_load_ps(&src2[i]); + auto b1 = _mm256_load_ps(&src2[i + 8]); + auto b2 = _mm256_load_ps(&src2[i + 16]); + auto b3 = _mm256_load_ps(&src2[i + 24]); + + auto result0 = _mm256_add_ps(a0, b0); + auto result1 = _mm256_add_ps(a1, b1); + auto result2 = _mm256_add_ps(a2, b2); + auto result3 = _mm256_add_ps(a3, b3); + + _mm256_store_ps(&dst[i], result0); + _mm256_store_ps(&dst[i + 8], result1); + _mm256_store_ps(&dst[i + 16], result2); + _mm256_store_ps(&dst[i + 24], result3); + } + + for (; i + simd_width <= num_samples; i += simd_width) + { + auto a = _mm256_load_ps(&src1[i]); + auto b = _mm256_load_ps(&src2[i]); + auto result = _mm256_add_ps(a, b); + _mm256_store_ps(&dst[i], result); + } + + for (; i < num_samples; ++i) + { + dst[i] = src1[i] + src2[i]; + } + } + + SIMD_EXPORT void apply_gain(const float *src, float *dst, float gain, size_t num_samples) + { + ASSERT_ALIGNED(src, ALIGNMENT_AVX); + ASSERT_ALIGNED(dst, ALIGNMENT_AVX); + + constexpr size_t simd_width = 8; + constexpr size_t unroll_factor = 4; + size_t i = 0; + + auto gain_vec = _mm256_set1_ps(gain); + + for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) + { + auto a0 = _mm256_load_ps(&src[i]); + auto a1 = _mm256_load_ps(&src[i + 8]); + auto a2 = _mm256_load_ps(&src[i + 16]); + auto a3 = _mm256_load_ps(&src[i + 24]); + + auto result0 = _mm256_mul_ps(a0, gain_vec); + auto result1 = _mm256_mul_ps(a1, gain_vec); + auto result2 = _mm256_mul_ps(a2, gain_vec); + auto result3 = _mm256_mul_ps(a3, gain_vec); + + _mm256_store_ps(&dst[i], result0); + _mm256_store_ps(&dst[i + 8], result1); + _mm256_store_ps(&dst[i + 16], result2); + _mm256_store_ps(&dst[i + 24], result3); + } + + for (; i + simd_width <= num_samples; i += simd_width) + { + auto a = 
_mm256_load_ps(&src[i]); + auto result = _mm256_mul_ps(a, gain_vec); + _mm256_store_ps(&dst[i], result); + } + + for (; i < num_samples; ++i) + { + dst[i] = src[i] * gain; + } + } + + SIMD_EXPORT float calculate_rms(const float *src, size_t num_samples) + { + ASSERT_ALIGNED(src, ALIGNMENT_AVX); + + constexpr size_t simd_width = 8; + constexpr size_t unroll_factor = 4; + size_t i = 0; + auto sum_squares0 = _mm256_setzero_ps(); + auto sum_squares1 = _mm256_setzero_ps(); + auto sum_squares2 = _mm256_setzero_ps(); + auto sum_squares3 = _mm256_setzero_ps(); + + for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) + { + const auto a0 = _mm256_load_ps(&src[i]); + const auto a1 = _mm256_load_ps(&src[i + 8]); + const auto a2 = _mm256_load_ps(&src[i + 16]); + const auto a3 = _mm256_load_ps(&src[i + 24]); + + const auto squared0 = _mm256_mul_ps(a0, a0); + const auto squared1 = _mm256_mul_ps(a1, a1); + const auto squared2 = _mm256_mul_ps(a2, a2); + const auto squared3 = _mm256_mul_ps(a3, a3); + + sum_squares0 = _mm256_add_ps(sum_squares0, squared0); + sum_squares1 = _mm256_add_ps(sum_squares1, squared1); + sum_squares2 = _mm256_add_ps(sum_squares2, squared2); + sum_squares3 = _mm256_add_ps(sum_squares3, squared3); + } + + auto sum_squares = _mm256_add_ps(_mm256_add_ps(sum_squares0, sum_squares1), + _mm256_add_ps(sum_squares2, sum_squares3)); + + for (; i + simd_width <= num_samples; i += simd_width) + { + const auto a = _mm256_load_ps(&src[i]); + const auto squared = _mm256_mul_ps(a, a); + sum_squares = _mm256_add_ps(sum_squares, squared); + } + + auto hadd1 = _mm256_hadd_ps(sum_squares, sum_squares); + auto hadd2 = _mm256_hadd_ps(hadd1, hadd1); + + auto low = _mm256_extractf128_ps(hadd2, 0); + auto high = _mm256_extractf128_ps(hadd2, 1); + auto final_sum = _mm_add_ps(low, high); + double total_sum = _mm_cvtss_f32(final_sum); + + for (; i < num_samples; ++i) + { + total_sum += static_cast(src[i]) * static_cast(src[i]); + } + + return 
static_cast(std::sqrt(total_sum / static_cast(num_samples))); + } + + SIMD_EXPORT float calculate_peak(const float *src, size_t num_samples) + { + ASSERT_ALIGNED(src, ALIGNMENT_AVX); + + constexpr size_t simd_width = 8; + constexpr size_t unroll_factor = 4; + size_t i = 0; + auto peak_vec0 = _mm256_setzero_ps(); + auto peak_vec1 = _mm256_setzero_ps(); + auto peak_vec2 = _mm256_setzero_ps(); + auto peak_vec3 = _mm256_setzero_ps(); + + for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) + { + const auto a0 = _mm256_load_ps(&src[i]); + const auto a1 = _mm256_load_ps(&src[i + 8]); + const auto a2 = _mm256_load_ps(&src[i + 16]); + const auto a3 = _mm256_load_ps(&src[i + 24]); + + const auto abs_a0 = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a0); + const auto abs_a1 = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a1); + const auto abs_a2 = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a2); + const auto abs_a3 = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a3); + + peak_vec0 = _mm256_max_ps(peak_vec0, abs_a0); + peak_vec1 = _mm256_max_ps(peak_vec1, abs_a1); + peak_vec2 = _mm256_max_ps(peak_vec2, abs_a2); + peak_vec3 = _mm256_max_ps(peak_vec3, abs_a3); + } + + auto peak_vec = _mm256_max_ps(_mm256_max_ps(peak_vec0, peak_vec1), + _mm256_max_ps(peak_vec2, peak_vec3)); + + for (; i + simd_width <= num_samples; i += simd_width) + { + const auto a = _mm256_load_ps(&src[i]); + const auto abs_a = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a); + peak_vec = _mm256_max_ps(peak_vec, abs_a); + } + + auto low = _mm256_extractf128_ps(peak_vec, 0); + auto high = _mm256_extractf128_ps(peak_vec, 1); + auto max_lane = _mm_max_ps(low, high); + + auto temp1 = _mm_shuffle_ps(max_lane, max_lane, _MM_SHUFFLE(2, 3, 0, 1)); + auto max1 = _mm_max_ps(max_lane, temp1); + auto temp2 = _mm_shuffle_ps(max1, max1, _MM_SHUFFLE(1, 0, 3, 2)); + auto final_max = _mm_max_ps(max1, temp2); + float peak = _mm_cvtss_f32(final_max); + + for (; i < num_samples; ++i) + { + float abs_sample = 
std::fabs(src[i]); + if (abs_sample > peak) + { + peak = abs_sample; + } + } + + return peak; + } + + SIMD_EXPORT void normalize_audio(const float *src, float *dst, float target_peak, size_t num_samples) + { + ASSERT_ALIGNED(src, ALIGNMENT_AVX); + ASSERT_ALIGNED(dst, ALIGNMENT_AVX); + + if (num_samples == 0 || target_peak <= 0.0f) + { + return; + } + + const float current_peak = calculate_peak(src, num_samples); + + if (current_peak < 1e-10f) + { + constexpr size_t simd_width = 8; + constexpr size_t unroll_factor = 4; + auto zero_vec = _mm256_setzero_ps(); + size_t i = 0; + + for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) + { + _mm256_store_ps(&dst[i], zero_vec); + _mm256_store_ps(&dst[i + 8], zero_vec); + _mm256_store_ps(&dst[i + 16], zero_vec); + _mm256_store_ps(&dst[i + 24], zero_vec); + } + + for (; i + simd_width <= num_samples; i += simd_width) + { + _mm256_store_ps(&dst[i], zero_vec); + } + + for (; i < num_samples; ++i) + { + dst[i] = 0.0f; + } + return; + } + + const float gain_factor = target_peak / current_peak; + apply_gain(src, dst, gain_factor, num_samples); + } + + SIMD_EXPORT void stereo_to_mono(const float *stereo_src, float *mono_dst, size_t num_stereo_samples) + { + ASSERT_ALIGNED(stereo_src, ALIGNMENT_AVX); + ASSERT_ALIGNED(mono_dst, ALIGNMENT_AVX); + + if (num_stereo_samples == 0) + { + return; + } + + constexpr size_t simd_width = 8; + constexpr size_t unroll_factor = 4; + const auto half_vec = _mm256_set1_ps(0.5f); + size_t stereo_idx = 0; + size_t mono_idx = 0; + + for (; stereo_idx + simd_width * 2 * unroll_factor <= num_stereo_samples * 2; + stereo_idx += simd_width * 2 * unroll_factor, mono_idx += simd_width * unroll_factor) + { + auto stereo0 = _mm256_load_ps(&stereo_src[stereo_idx]); + auto stereo1 = _mm256_load_ps(&stereo_src[stereo_idx + 8]); + auto stereo2 = _mm256_load_ps(&stereo_src[stereo_idx + 16]); + auto stereo3 = _mm256_load_ps(&stereo_src[stereo_idx + 24]); + auto stereo4 = 
_mm256_load_ps(&stereo_src[stereo_idx + 32]); + auto stereo5 = _mm256_load_ps(&stereo_src[stereo_idx + 40]); + auto stereo6 = _mm256_load_ps(&stereo_src[stereo_idx + 48]); + auto stereo7 = _mm256_load_ps(&stereo_src[stereo_idx + 56]); + + auto left0 = _mm256_shuffle_ps(stereo0, stereo1, _MM_SHUFFLE(2, 0, 2, 0)); + auto right0 = _mm256_shuffle_ps(stereo0, stereo1, _MM_SHUFFLE(3, 1, 3, 1)); + auto left1 = _mm256_shuffle_ps(stereo2, stereo3, _MM_SHUFFLE(2, 0, 2, 0)); + auto right1 = _mm256_shuffle_ps(stereo2, stereo3, _MM_SHUFFLE(3, 1, 3, 1)); + auto left2 = _mm256_shuffle_ps(stereo4, stereo5, _MM_SHUFFLE(2, 0, 2, 0)); + auto right2 = _mm256_shuffle_ps(stereo4, stereo5, _MM_SHUFFLE(3, 1, 3, 1)); + auto left3 = _mm256_shuffle_ps(stereo6, stereo7, _MM_SHUFFLE(2, 0, 2, 0)); + auto right3 = _mm256_shuffle_ps(stereo6, stereo7, _MM_SHUFFLE(3, 1, 3, 1)); + + left0 = _mm256_permute2f128_ps(left0, left0, 0x01); + right0 = _mm256_permute2f128_ps(right0, right0, 0x01); + left1 = _mm256_permute2f128_ps(left1, left1, 0x01); + right1 = _mm256_permute2f128_ps(right1, right1, 0x01); + left2 = _mm256_permute2f128_ps(left2, left2, 0x01); + right2 = _mm256_permute2f128_ps(right2, right2, 0x01); + left3 = _mm256_permute2f128_ps(left3, left3, 0x01); + right3 = _mm256_permute2f128_ps(right3, right3, 0x01); + + auto mono0 = _mm256_mul_ps(_mm256_add_ps(left0, right0), half_vec); + auto mono1 = _mm256_mul_ps(_mm256_add_ps(left1, right1), half_vec); + auto mono2 = _mm256_mul_ps(_mm256_add_ps(left2, right2), half_vec); + auto mono3 = _mm256_mul_ps(_mm256_add_ps(left3, right3), half_vec); + + _mm256_store_ps(&mono_dst[mono_idx], mono0); + _mm256_store_ps(&mono_dst[mono_idx + 8], mono1); + _mm256_store_ps(&mono_dst[mono_idx + 16], mono2); + _mm256_store_ps(&mono_dst[mono_idx + 24], mono3); + } + + for (size_t i = stereo_idx / 2; i < num_stereo_samples; ++i) + { + const float left = stereo_src[i * 2]; + const float right = stereo_src[i * 2 + 1]; + mono_dst[i] = (left + right) * 0.5f; + } + } + + 
SIMD_EXPORT void limit_audio(const float *src, float *dst, float threshold, float *limiter_state, float sample_rate, + size_t num_samples) + { + ASSERT_ALIGNED(src, ALIGNMENT_AVX); + ASSERT_ALIGNED(dst, ALIGNMENT_AVX); + + if (num_samples == 0 || threshold <= 0.0f) + { + return; + } + + constexpr size_t simd_width = 8; + constexpr size_t unroll_factor = 4; + constexpr float release_time = 0.05f; + float release_coeff = std::exp(-1.0f / (release_time * sample_rate)); + + float current_gain = limiter_state != nullptr ? *limiter_state : 1.0f; + + size_t i = 0; + + for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) + { + auto a0 = _mm256_load_ps(&src[i]); + auto a1 = _mm256_load_ps(&src[i + 8]); + auto a2 = _mm256_load_ps(&src[i + 16]); + auto a3 = _mm256_load_ps(&src[i + 24]); + + auto abs_a0 = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a0); + auto abs_a1 = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a1); + auto abs_a2 = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a2); + auto abs_a3 = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a3); + + auto max_abs = _mm256_max_ps(_mm256_max_ps(abs_a0, abs_a1), + _mm256_max_ps(abs_a2, abs_a3)); + + auto high = _mm256_extractf128_ps(max_abs, 1); + auto low = _mm256_extractf128_ps(max_abs, 0); + auto max_lane = _mm_max_ps(high, low); + + auto temp1 = _mm_shuffle_ps(max_lane, max_lane, _MM_SHUFFLE(2, 3, 0, 1)); + auto max1 = _mm_max_ps(max_lane, temp1); + auto temp2 = _mm_shuffle_ps(max1, max1, _MM_SHUFFLE(1, 0, 3, 2)); + auto final_max = _mm_max_ps(max1, temp2); + + float max_sample = _mm_cvtss_f32(final_max); + + float target_gain = max_sample > threshold ? 
threshold / max_sample : 1.0f; + + if (target_gain < current_gain) + { + current_gain = target_gain; + } + else + { + current_gain = target_gain + (current_gain - target_gain) * release_coeff; + } + + auto gain_vec = _mm256_set1_ps(current_gain); + + auto result0 = _mm256_mul_ps(a0, gain_vec); + auto result1 = _mm256_mul_ps(a1, gain_vec); + auto result2 = _mm256_mul_ps(a2, gain_vec); + auto result3 = _mm256_mul_ps(a3, gain_vec); + + _mm256_store_ps(&dst[i], result0); + _mm256_store_ps(&dst[i + 8], result1); + _mm256_store_ps(&dst[i + 16], result2); + _mm256_store_ps(&dst[i + 24], result3); + } + + for (; i + simd_width <= num_samples; i += simd_width) + { + auto a = _mm256_load_ps(&src[i]); + auto abs_a = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a); + + auto high = _mm256_extractf128_ps(abs_a, 1); + auto low = _mm256_extractf128_ps(abs_a, 0); + auto max_lane = _mm_max_ps(high, low); + + auto temp1 = _mm_shuffle_ps(max_lane, max_lane, _MM_SHUFFLE(2, 3, 0, 1)); + auto max1 = _mm_max_ps(max_lane, temp1); + auto temp2 = _mm_shuffle_ps(max1, max1, _MM_SHUFFLE(1, 0, 3, 2)); + auto final_max = _mm_max_ps(max1, temp2); + + float max_sample = _mm_cvtss_f32(final_max); + + float target_gain = max_sample > threshold ? threshold / max_sample : 1.0f; + + if (target_gain < current_gain) + { + current_gain = target_gain; + } + else + { + current_gain = target_gain + (current_gain - target_gain) * release_coeff; + } + + auto gain_vec = _mm256_set1_ps(current_gain); + auto result = _mm256_mul_ps(a, gain_vec); + _mm256_store_ps(&dst[i], result); + } + + for (; i < num_samples; ++i) + { + float sample = src[i]; + float abs_sample = std::fabs(sample); + + float target_gain = abs_sample > threshold ? 
threshold / abs_sample : 1.0f; + + if (target_gain < current_gain) + { + current_gain = target_gain; + } + else + { + current_gain = target_gain + (current_gain - target_gain) * release_coeff; + } + + dst[i] = sample * current_gain; + } + + if (limiter_state != nullptr) + { + *limiter_state = current_gain; + } + } + + SIMD_EXPORT void fade_audio(const float *src, float *dst, size_t fade_in_samples, size_t fade_out_samples, + size_t num_samples) + { + ASSERT_ALIGNED(src, ALIGNMENT_AVX); + ASSERT_ALIGNED(dst, ALIGNMENT_AVX); + + if (num_samples == 0) + { + return; + } + + constexpr size_t simd_width = 8; + constexpr size_t unroll_factor = 4; + size_t i = 0; + + if (fade_in_samples > 0) + { + const float fade_in_step = 1.0f / static_cast(fade_in_samples); + + for (; i + simd_width * unroll_factor <= std::min(fade_in_samples, num_samples); i += simd_width * + unroll_factor) + { + auto gain0 = _mm256_set_ps((i + 7) * fade_in_step, (i + 6) * fade_in_step, (i + 5) * fade_in_step, + (i + 4) * fade_in_step, + (i + 3) * fade_in_step, (i + 2) * fade_in_step, (i + 1) * fade_in_step, + i * fade_in_step); + auto gain1 = _mm256_set_ps((i + 15) * fade_in_step, (i + 14) * fade_in_step, (i + 13) * fade_in_step, + (i + 12) * fade_in_step, + (i + 11) * fade_in_step, (i + 10) * fade_in_step, (i + 9) * fade_in_step, + (i + 8) * fade_in_step); + auto gain2 = _mm256_set_ps((i + 23) * fade_in_step, (i + 22) * fade_in_step, (i + 21) * fade_in_step, + (i + 20) * fade_in_step, + (i + 19) * fade_in_step, (i + 18) * fade_in_step, (i + 17) * fade_in_step, + (i + 16) * fade_in_step); + auto gain3 = _mm256_set_ps((i + 31) * fade_in_step, (i + 30) * fade_in_step, (i + 29) * fade_in_step, + (i + 28) * fade_in_step, + (i + 27) * fade_in_step, (i + 26) * fade_in_step, (i + 25) * fade_in_step, + (i + 24) * fade_in_step); + + auto a0 = _mm256_load_ps(&src[i]); + auto a1 = _mm256_load_ps(&src[i + 8]); + auto a2 = _mm256_load_ps(&src[i + 16]); + auto a3 = _mm256_load_ps(&src[i + 24]); + + auto result0 = 
_mm256_mul_ps(a0, gain0); + auto result1 = _mm256_mul_ps(a1, gain1); + auto result2 = _mm256_mul_ps(a2, gain2); + auto result3 = _mm256_mul_ps(a3, gain3); + + _mm256_store_ps(&dst[i], result0); + _mm256_store_ps(&dst[i + 8], result1); + _mm256_store_ps(&dst[i + 16], result2); + _mm256_store_ps(&dst[i + 24], result3); + } + + for (; i < std::min(fade_in_samples, num_samples); ++i) + { + const float gain = static_cast(i) / static_cast(fade_in_samples); + dst[i] = src[i] * gain; + } + } + + const size_t middle_start = fade_in_samples; + const size_t middle_end = num_samples > fade_out_samples ? num_samples - fade_out_samples : 0; + + if (middle_end > middle_start) + { + for (size_t j = middle_start; j + simd_width * unroll_factor <= middle_end; j += simd_width * + unroll_factor) + { + auto a0 = _mm256_load_ps(&src[j]); + auto a1 = _mm256_load_ps(&src[j + 8]); + auto a2 = _mm256_load_ps(&src[j + 16]); + auto a3 = _mm256_load_ps(&src[j + 24]); + + _mm256_store_ps(&dst[j], a0); + _mm256_store_ps(&dst[j + 8], a1); + _mm256_store_ps(&dst[j + 16], a2); + _mm256_store_ps(&dst[j + 24], a3); + } + + for (size_t j = middle_start + ((middle_end - middle_start) / (simd_width * unroll_factor)) * (simd_width * + unroll_factor); + j < middle_end; ++j) + { + dst[j] = src[j]; + } + } + + if (fade_out_samples > 0 && num_samples > fade_out_samples) + { + const size_t fade_out_start = num_samples - fade_out_samples; + const float fade_out_step = 1.0f / static_cast(fade_out_samples); + + for (size_t j = fade_out_start; j + simd_width * unroll_factor <= num_samples; j += simd_width * + unroll_factor) + { + const size_t fade_out_offset = j - fade_out_start; + auto gain0 = _mm256_set_ps(1.0f - (fade_out_offset + 7) * fade_out_step, + 1.0f - (fade_out_offset + 6) * fade_out_step, + 1.0f - (fade_out_offset + 5) * fade_out_step, + 1.0f - (fade_out_offset + 4) * fade_out_step, + 1.0f - (fade_out_offset + 3) * fade_out_step, + 1.0f - (fade_out_offset + 2) * fade_out_step, + 1.0f - 
(fade_out_offset + 1) * fade_out_step, + 1.0f - fade_out_offset * fade_out_step); + auto gain1 = _mm256_set_ps(1.0f - (fade_out_offset + 15) * fade_out_step, + 1.0f - (fade_out_offset + 14) * fade_out_step, + 1.0f - (fade_out_offset + 13) * fade_out_step, + 1.0f - (fade_out_offset + 12) * fade_out_step, + 1.0f - (fade_out_offset + 11) * fade_out_step, + 1.0f - (fade_out_offset + 10) * fade_out_step, + 1.0f - (fade_out_offset + 9) * fade_out_step, + 1.0f - (fade_out_offset + 8) * fade_out_step); + auto gain2 = _mm256_set_ps(1.0f - (fade_out_offset + 23) * fade_out_step, + 1.0f - (fade_out_offset + 22) * fade_out_step, + 1.0f - (fade_out_offset + 21) * fade_out_step, + 1.0f - (fade_out_offset + 20) * fade_out_step, + 1.0f - (fade_out_offset + 19) * fade_out_step, + 1.0f - (fade_out_offset + 18) * fade_out_step, + 1.0f - (fade_out_offset + 17) * fade_out_step, + 1.0f - (fade_out_offset + 16) * fade_out_step); + auto gain3 = _mm256_set_ps(1.0f - (fade_out_offset + 31) * fade_out_step, + 1.0f - (fade_out_offset + 30) * fade_out_step, + 1.0f - (fade_out_offset + 29) * fade_out_step, + 1.0f - (fade_out_offset + 28) * fade_out_step, + 1.0f - (fade_out_offset + 27) * fade_out_step, + 1.0f - (fade_out_offset + 26) * fade_out_step, + 1.0f - (fade_out_offset + 25) * fade_out_step, + 1.0f - (fade_out_offset + 24) * fade_out_step); + + auto a0 = _mm256_load_ps(&src[j]); + auto a1 = _mm256_load_ps(&src[j + 8]); + auto a2 = _mm256_load_ps(&src[j + 16]); + auto a3 = _mm256_load_ps(&src[j + 24]); + + auto result0 = _mm256_mul_ps(a0, gain0); + auto result1 = _mm256_mul_ps(a1, gain1); + auto result2 = _mm256_mul_ps(a2, gain2); + auto result3 = _mm256_mul_ps(a3, gain3); + + _mm256_store_ps(&dst[j], result0); + _mm256_store_ps(&dst[j + 8], result1); + _mm256_store_ps(&dst[j + 16], result2); + _mm256_store_ps(&dst[j + 24], result3); + } + + for (size_t j = fade_out_start + ((fade_out_samples / (simd_width * unroll_factor)) * (simd_width * + unroll_factor)); + j < num_samples; ++j) + { + 
const size_t fade_out_offset = j - fade_out_start; + const float gain = 1.0f - static_cast(fade_out_offset) / static_cast(fade_out_samples); + dst[j] = src[j] * gain; + } + } + } + + SIMD_EXPORT void simple_eq(const float *src, float *dst, float low_gain, float mid_gain, float high_gain, float *eq_state, + size_t num_samples) + { + ASSERT_ALIGNED(src, ALIGNMENT_AVX); + ASSERT_ALIGNED(dst, ALIGNMENT_AVX); + + if (num_samples == 0) + { + return; + } + + constexpr size_t simd_width = 8; + constexpr size_t unroll_factor = 4; + + constexpr float low_cutoff = 0.02f; + constexpr float high_cutoff = 0.1f; + constexpr float mid_factor = 0.7f; + + float low_state = eq_state != nullptr ? *eq_state : 0.0f; + float high_state = eq_state != nullptr ? *(eq_state + 1) : 0.0f; + + const auto low_gain_vec = _mm256_set1_ps(low_gain); + const auto mid_gain_vec = _mm256_set1_ps(mid_gain); + const auto high_gain_vec = _mm256_set1_ps(high_gain); + const auto low_cutoff_vec = _mm256_set1_ps(low_cutoff); + const auto high_cutoff_vec = _mm256_set1_ps(high_cutoff); + const auto mid_factor_vec = _mm256_set1_ps(mid_factor); + const auto one_minus_low_cutoff_vec = _mm256_set1_ps(1.0f - low_cutoff); + const auto one_minus_high_cutoff_vec = _mm256_set1_ps(1.0f - high_cutoff); + + size_t i = 0; + + for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) + { + auto input0 = _mm256_load_ps(&src[i]); + auto input1 = _mm256_load_ps(&src[i + 8]); + auto input2 = _mm256_load_ps(&src[i + 16]); + auto input3 = _mm256_load_ps(&src[i + 24]); + + auto low_state_vec = _mm256_set1_ps(low_state); + auto low0 = _mm256_add_ps(_mm256_mul_ps(input0, low_cutoff_vec), + _mm256_mul_ps(low_state_vec, one_minus_low_cutoff_vec)); + auto low1 = _mm256_add_ps(_mm256_mul_ps(input1, low_cutoff_vec), + _mm256_mul_ps(low0, one_minus_low_cutoff_vec)); + auto low2 = _mm256_add_ps(_mm256_mul_ps(input2, low_cutoff_vec), + _mm256_mul_ps(low1, one_minus_low_cutoff_vec)); + auto low3 = 
_mm256_add_ps(_mm256_mul_ps(input3, low_cutoff_vec), + _mm256_mul_ps(low2, one_minus_low_cutoff_vec)); + + auto high0 = _mm256_sub_ps(input0, low0); + auto high1 = _mm256_sub_ps(input1, low1); + auto high2 = _mm256_sub_ps(input2, low2); + auto high3 = _mm256_sub_ps(input3, low3); + + auto high_state_vec = _mm256_set1_ps(high_state); + high0 = _mm256_add_ps(_mm256_mul_ps(high0, high_cutoff_vec), + _mm256_mul_ps(high_state_vec, one_minus_high_cutoff_vec)); + high1 = _mm256_add_ps(_mm256_mul_ps(high1, high_cutoff_vec), + _mm256_mul_ps(high0, one_minus_high_cutoff_vec)); + high2 = _mm256_add_ps(_mm256_mul_ps(high2, high_cutoff_vec), + _mm256_mul_ps(high1, one_minus_high_cutoff_vec)); + high3 = _mm256_add_ps(_mm256_mul_ps(high3, high_cutoff_vec), + _mm256_mul_ps(high2, one_minus_high_cutoff_vec)); + + auto mid0 = _mm256_mul_ps(_mm256_sub_ps(_mm256_sub_ps(input0, low0), high0), mid_factor_vec); + auto mid1 = _mm256_mul_ps(_mm256_sub_ps(_mm256_sub_ps(input1, low1), high1), mid_factor_vec); + auto mid2 = _mm256_mul_ps(_mm256_sub_ps(_mm256_sub_ps(input2, low2), high2), mid_factor_vec); + auto mid3 = _mm256_mul_ps(_mm256_sub_ps(_mm256_sub_ps(input3, low3), high3), mid_factor_vec); + + auto result0 = _mm256_add_ps( + _mm256_add_ps(_mm256_mul_ps(low0, low_gain_vec), _mm256_mul_ps(mid0, mid_gain_vec)), + _mm256_mul_ps(high0, high_gain_vec)); + auto result1 = _mm256_add_ps( + _mm256_add_ps(_mm256_mul_ps(low1, low_gain_vec), _mm256_mul_ps(mid1, mid_gain_vec)), + _mm256_mul_ps(high1, high_gain_vec)); + auto result2 = _mm256_add_ps( + _mm256_add_ps(_mm256_mul_ps(low2, low_gain_vec), _mm256_mul_ps(mid2, mid_gain_vec)), + _mm256_mul_ps(high2, high_gain_vec)); + auto result3 = _mm256_add_ps( + _mm256_add_ps(_mm256_mul_ps(low3, low_gain_vec), _mm256_mul_ps(mid3, mid_gain_vec)), + _mm256_mul_ps(high3, high_gain_vec)); + + _mm256_store_ps(&dst[i], result0); + _mm256_store_ps(&dst[i + 8], result1); + _mm256_store_ps(&dst[i + 16], result2); + _mm256_store_ps(&dst[i + 24], result3); + + 
auto low_temp = _mm256_extractf128_ps(low3, 1); + low_state = _mm_cvtss_f32(_mm_shuffle_ps(low_temp, low_temp, _MM_SHUFFLE(3, 3, 3, 3))); + auto high_temp = _mm256_extractf128_ps(high3, 1); + high_state = _mm_cvtss_f32(_mm_shuffle_ps(high_temp, high_temp, _MM_SHUFFLE(3, 3, 3, 3))); + } + + for (; i < num_samples; ++i) + { + float input = src[i]; + + float low_output = low_cutoff * input + (1.0f - low_cutoff) * low_state; + low_state = low_output; + + float high_input = input - low_output; + float high_output = high_cutoff * high_input + (1.0f - high_cutoff) * high_state; + high_state = high_output; + + float mid_output = (input - low_output - high_output) * mid_factor; + + dst[i] = low_output * low_gain + mid_output * mid_gain + high_output * high_gain; + } + + if (eq_state != nullptr) + { + *eq_state = low_state; + *(eq_state + 1) = high_state; + } + } +} +#endif \ No newline at end of file diff --git a/src/simd/simd_avx512/CMakeLists.txt b/src/simd/simd_avx512/CMakeLists.txt new file mode 100644 index 0000000..430a5f7 --- /dev/null +++ b/src/simd/simd_avx512/CMakeLists.txt @@ -0,0 +1,7 @@ +project(alicho_simd_avx512) + +if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64") + simple_library(SHARED) + target_compile_options(${PROJECT_NAME} PRIVATE -mavx512f -mavx512bw -mavx512vl -mavx512dq) + target_link_libraries(${PROJECT_NAME} PUBLIC alicho_simd_interface) +endif() \ No newline at end of file diff --git a/src/simd/simd_avx512/simd_func.cpp b/src/simd/simd_avx512/simd_func.cpp new file mode 100644 index 0000000..f3bbb92 --- /dev/null +++ b/src/simd/simd_avx512/simd_func.cpp @@ -0,0 +1,756 @@ +/** + * @file x86_avx512_audio_processing_func.cpp + * @brief x86 AVX-512音频处理函数实现 + */ + +#include "simd_interface.h" + +#include +#include +#include "aligned_allocator.h" + +extern "C" +{ + SIMD_EXPORT void fill_buffer(float *buffer, float value, size_t num_samples) + { + ASSERT_ALIGNED(buffer, ALIGNMENT_AVX512); + + constexpr size_t simd_width = 16; + constexpr size_t 
unroll_factor = 4; + size_t i = 0; + + auto value_vec = _mm512_set1_ps(value); + + for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) + { + _mm512_store_ps(&buffer[i], value_vec); + _mm512_store_ps(&buffer[i + 16], value_vec); + _mm512_store_ps(&buffer[i + 32], value_vec); + _mm512_store_ps(&buffer[i + 48], value_vec); + } + + for (; i + simd_width <= num_samples; i += simd_width) + { + _mm512_store_ps(&buffer[i], value_vec); + } + + for (; i < num_samples; ++i) + { + buffer[i] = value; + } + } + + SIMD_EXPORT void mix_audio(const float *src1, const float *src2, float *dst, size_t num_samples) + { + ASSERT_ALIGNED(src1, ALIGNMENT_AVX512); + ASSERT_ALIGNED(src2, ALIGNMENT_AVX512); + ASSERT_ALIGNED(dst, ALIGNMENT_AVX512); + + constexpr size_t simd_width = 16; + constexpr size_t unroll_factor = 4; + size_t i = 0; + + for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) + { + const auto a0 = _mm512_load_ps(&src1[i]); + const auto a1 = _mm512_load_ps(&src1[i + 16]); + const auto a2 = _mm512_load_ps(&src1[i + 32]); + const auto a3 = _mm512_load_ps(&src1[i + 48]); + + const auto b0 = _mm512_load_ps(&src2[i]); + const auto b1 = _mm512_load_ps(&src2[i + 16]); + const auto b2 = _mm512_load_ps(&src2[i + 32]); + const auto b3 = _mm512_load_ps(&src2[i + 48]); + + const auto result0 = _mm512_add_ps(a0, b0); + const auto result1 = _mm512_add_ps(a1, b1); + const auto result2 = _mm512_add_ps(a2, b2); + const auto result3 = _mm512_add_ps(a3, b3); + + _mm512_store_ps(&dst[i], result0); + _mm512_store_ps(&dst[i + 16], result1); + _mm512_store_ps(&dst[i + 32], result2); + _mm512_store_ps(&dst[i + 48], result3); + } + + for (; i + simd_width <= num_samples; i += simd_width) + { + auto a = _mm512_load_ps(&src1[i]); + auto b = _mm512_load_ps(&src2[i]); + auto result = _mm512_add_ps(a, b); + _mm512_store_ps(&dst[i], result); + } + + for (; i < num_samples; ++i) + { + dst[i] = src1[i] + src2[i]; + } + } + + SIMD_EXPORT 
void apply_gain(const float *src, float *dst, float gain, size_t num_samples)
+    {
+        ASSERT_ALIGNED(src, ALIGNMENT_AVX512);
+        ASSERT_ALIGNED(dst, ALIGNMENT_AVX512);
+
+        constexpr size_t simd_width = 16;
+        constexpr size_t unroll_factor = 4;
+        size_t i = 0;
+
+        auto gain_vec = _mm512_set1_ps(gain);
+
+        // Main loop: 64 samples per iteration.
+        for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
+        {
+            auto a0 = _mm512_load_ps(&src[i]);
+            auto a1 = _mm512_load_ps(&src[i + 16]);
+            auto a2 = _mm512_load_ps(&src[i + 32]);
+            auto a3 = _mm512_load_ps(&src[i + 48]);
+
+            auto result0 = _mm512_mul_ps(a0, gain_vec);
+            auto result1 = _mm512_mul_ps(a1, gain_vec);
+            auto result2 = _mm512_mul_ps(a2, gain_vec);
+            auto result3 = _mm512_mul_ps(a3, gain_vec);
+
+            _mm512_store_ps(&dst[i], result0);
+            _mm512_store_ps(&dst[i + 16], result1);
+            _mm512_store_ps(&dst[i + 32], result2);
+            _mm512_store_ps(&dst[i + 48], result3);
+        }
+
+        // Remaining whole vectors.
+        for (; i + simd_width <= num_samples; i += simd_width)
+        {
+            auto a = _mm512_load_ps(&src[i]);
+            auto result = _mm512_mul_ps(a, gain_vec);
+            _mm512_store_ps(&dst[i], result);
+        }
+
+        // Scalar tail.
+        for (; i < num_samples; ++i)
+        {
+            dst[i] = src[i] * gain;
+        }
+    }
+
+    /**
+     * @brief Computes the root-mean-square level of a buffer.
+     * @param src         Input; checked with ASSERT_ALIGNED.
+     * @param num_samples Number of floats to read.
+     * @return sqrt(sum(src[i]^2) / num_samples), or 0 when num_samples is 0.
+     */
+    SIMD_EXPORT float calculate_rms(const float *src, size_t num_samples)
+    {
+        ASSERT_ALIGNED(src, ALIGNMENT_AVX512);
+
+        // An empty buffer would otherwise evaluate sqrt(0 / 0) and return NaN.
+        if (num_samples == 0)
+        {
+            return 0.0f;
+        }
+
+        constexpr size_t simd_width = 16;
+        constexpr size_t unroll_factor = 4;
+        size_t i = 0;
+        // Four independent accumulators keep the adds of one iteration from
+        // depending on each other.
+        auto sum_squares0 = _mm512_setzero_ps();
+        auto sum_squares1 = _mm512_setzero_ps();
+        auto sum_squares2 = _mm512_setzero_ps();
+        auto sum_squares3 = _mm512_setzero_ps();
+
+        for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
+        {
+            const auto a0 = _mm512_load_ps(&src[i]);
+            const auto a1 = _mm512_load_ps(&src[i + 16]);
+            const auto a2 = _mm512_load_ps(&src[i + 32]);
+            const auto a3 = _mm512_load_ps(&src[i + 48]);
+
+            const auto squared0 = _mm512_mul_ps(a0, a0);
+            const auto squared1 = _mm512_mul_ps(a1, a1);
+            const auto squared2 = _mm512_mul_ps(a2, a2);
+            const auto squared3 = _mm512_mul_ps(a3, a3);
+
+            sum_squares0 = _mm512_add_ps(sum_squares0, squared0);
+            sum_squares1 = _mm512_add_ps(sum_squares1, squared1);
+            sum_squares2 = _mm512_add_ps(sum_squares2, squared2);
+            sum_squares3 = _mm512_add_ps(sum_squares3, squared3);
+        }
+
+        auto sum_squares = _mm512_add_ps(_mm512_add_ps(sum_squares0, sum_squares1),
+                                         _mm512_add_ps(sum_squares2, sum_squares3));
+
+        for (; i + simd_width <= num_samples; i += simd_width)
+        {
+            const auto a = _mm512_load_ps(&src[i]);
+            const auto squared = _mm512_mul_ps(a, a);
+            sum_squares = _mm512_add_ps(sum_squares, squared);
+        }
+
+        // The horizontal reduction is done in float; only the scalar tail and
+        // the final division run in double.
+        double total_sum = _mm512_reduce_add_ps(sum_squares);
+
+        for (; i < num_samples; ++i)
+        {
+            total_sum += static_cast<double>(src[i]) * static_cast<double>(src[i]);
+        }
+
+        return static_cast<float>(std::sqrt(total_sum / static_cast<double>(num_samples)));
+    }
+
+    /**
+     * @brief Returns the largest absolute sample value in a buffer.
+     * @param src         Input; checked with ASSERT_ALIGNED.
+     * @param num_samples Number of floats to read.
+     * @return max(|src[i]|); 0 for an empty buffer (the accumulators start at 0).
+     */
+    SIMD_EXPORT float calculate_peak(const float *src, size_t num_samples)
+    {
+        ASSERT_ALIGNED(src, ALIGNMENT_AVX512);
+
+        constexpr size_t simd_width = 16;
+        constexpr size_t unroll_factor = 4;
+        size_t i = 0;
+        auto peak_vec0 = _mm512_setzero_ps();
+        auto peak_vec1 = _mm512_setzero_ps();
+        auto peak_vec2 = _mm512_setzero_ps();
+        auto peak_vec3 = _mm512_setzero_ps();
+
+        // _mm512_abs_ps clears the sign bit exactly like the previous
+        // andnot(-0.0f, x) form; it is what limit_audio already uses and it
+        // only needs AVX-512F.
+        for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
+        {
+            const auto abs_a0 = _mm512_abs_ps(_mm512_load_ps(&src[i]));
+            const auto abs_a1 = _mm512_abs_ps(_mm512_load_ps(&src[i + 16]));
+            const auto abs_a2 = _mm512_abs_ps(_mm512_load_ps(&src[i + 32]));
+            const auto abs_a3 = _mm512_abs_ps(_mm512_load_ps(&src[i + 48]));
+
+            peak_vec0 = _mm512_max_ps(peak_vec0, abs_a0);
+            peak_vec1 = _mm512_max_ps(peak_vec1, abs_a1);
+            peak_vec2 = _mm512_max_ps(peak_vec2, abs_a2);
+            peak_vec3 = _mm512_max_ps(peak_vec3, abs_a3);
+        }
+
+        auto peak_vec = _mm512_max_ps(_mm512_max_ps(peak_vec0, peak_vec1),
+                                      _mm512_max_ps(peak_vec2, peak_vec3));
+
+        for (; i + simd_width <= num_samples; i += simd_width)
+        {
+            const auto abs_a = _mm512_abs_ps(_mm512_load_ps(&src[i]));
+            peak_vec = _mm512_max_ps(peak_vec, abs_a);
+        }
+
+        float peak = _mm512_reduce_max_ps(peak_vec);
+
+        // Scalar tail.
+        for (; i < num_samples; ++i)
+        {
+            float abs_sample = std::fabs(src[i]);
+            if (abs_sample > peak)
+            {
+                peak = abs_sample;
+            }
+        }
+
+        return peak;
+    }
+
+    /**
+     * @brief Scales src so that its peak equals target_peak and writes it to dst.
+     *
+     * Returns without writing dst when num_samples is 0 or target_peak <= 0.
+     * When the measured peak is below 1e-10 the output is filled with zeros
+     * instead of being amplified.
+     *
+     * @param src         Input; checked with ASSERT_ALIGNED.
+     * @param dst         Output; checked with ASSERT_ALIGNED.
+     * @param target_peak Desired peak magnitude of the output.
+     * @param num_samples Number of floats to process.
+     */
+    SIMD_EXPORT void normalize_audio(const float *src, float *dst, float target_peak, size_t num_samples)
+    {
+        ASSERT_ALIGNED(src, ALIGNMENT_AVX512);
+        ASSERT_ALIGNED(dst, ALIGNMENT_AVX512);
+
+        if (num_samples == 0 || target_peak <= 0.0f)
+        {
+            return;
+        }
+
+        const float current_peak = calculate_peak(src, num_samples);
+
+        if (current_peak < 1e-10f)
+        {
+            // Effectively silent input: emit silence rather than divide by ~0.
+            fill_buffer(dst, 0.0f, num_samples);
+            return;
+        }
+
+        const float gain_factor = target_peak / current_peak;
+        apply_gain(src, dst, gain_factor, num_samples);
+    }
+
+    /**
+     * @brief Down-mixes interleaved stereo to mono by averaging each sample pair.
+     * @param stereo_src         Interleaved input, read as 2 * num_stereo_samples floats; checked with ASSERT_ALIGNED.
+     * @param mono_dst           Output of num_stereo_samples floats; checked with ASSERT_ALIGNED.
+     * @param num_stereo_samples Number of stereo frames.
+     */
+    SIMD_EXPORT void stereo_to_mono(const float *stereo_src, float *mono_dst, size_t num_stereo_samples)
+    {
+        ASSERT_ALIGNED(stereo_src, ALIGNMENT_AVX512);
+        ASSERT_ALIGNED(mono_dst, ALIGNMENT_AVX512);
+
+        if (num_stereo_samples == 0)
+        {
+            return;
+        }
+
+        constexpr size_t simd_width = 16;
+        constexpr size_t unroll_factor = 4;
+        const auto half_vec = _mm512_set1_ps(0.5f);
+        size_t stereo_idx = 0;
+        size_t mono_idx = 0;
+
+        for (; stereo_idx + simd_width * 2 * unroll_factor
<= num_stereo_samples * 2;
+             stereo_idx += simd_width * 2 * unroll_factor, mono_idx += simd_width * unroll_factor)
+        {
+            // 128 interleaved floats = 64 stereo frames per iteration.
+            auto stereo0 = _mm512_load_ps(&stereo_src[stereo_idx]);
+            auto stereo1 = _mm512_load_ps(&stereo_src[stereo_idx + 16]);
+            auto stereo2 = _mm512_load_ps(&stereo_src[stereo_idx + 32]);
+            auto stereo3 = _mm512_load_ps(&stereo_src[stereo_idx + 48]);
+            auto stereo4 = _mm512_load_ps(&stereo_src[stereo_idx + 64]);
+            auto stereo5 = _mm512_load_ps(&stereo_src[stereo_idx + 80]);
+            auto stereo6 = _mm512_load_ps(&stereo_src[stereo_idx + 96]);
+            auto stereo7 = _mm512_load_ps(&stereo_src[stereo_idx + 112]);
+
+            // Two-source permute indices: 0..15 pick from the first vector,
+            // 16..31 from the second. Even indices gather the first value of
+            // every frame, odd indices the second.
+            const auto even_mask = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
+            const auto odd_mask = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
+
+            auto left0 = _mm512_permutex2var_ps(stereo0, even_mask, stereo1);
+            auto right0 = _mm512_permutex2var_ps(stereo0, odd_mask, stereo1);
+            auto left1 = _mm512_permutex2var_ps(stereo2, even_mask, stereo3);
+            auto right1 = _mm512_permutex2var_ps(stereo2, odd_mask, stereo3);
+            auto left2 = _mm512_permutex2var_ps(stereo4, even_mask, stereo5);
+            auto right2 = _mm512_permutex2var_ps(stereo4, odd_mask, stereo5);
+            auto left3 = _mm512_permutex2var_ps(stereo6, even_mask, stereo7);
+            auto right3 = _mm512_permutex2var_ps(stereo6, odd_mask, stereo7);
+
+            auto mono0 = _mm512_mul_ps(_mm512_add_ps(left0, right0), half_vec);
+            auto mono1 = _mm512_mul_ps(_mm512_add_ps(left1, right1), half_vec);
+            auto mono2 = _mm512_mul_ps(_mm512_add_ps(left2, right2), half_vec);
+            auto mono3 = _mm512_mul_ps(_mm512_add_ps(left3, right3), half_vec);
+
+            _mm512_store_ps(&mono_dst[mono_idx], mono0);
+            _mm512_store_ps(&mono_dst[mono_idx + 16], mono1);
+            _mm512_store_ps(&mono_dst[mono_idx + 32], mono2);
+            _mm512_store_ps(&mono_dst[mono_idx + 48], mono3);
+        }
+
+        // Scalar tail, resuming at the first frame the vector loop did not cover.
+        for (size_t i = stereo_idx / 2; i < num_stereo_samples; ++i)
+        {
+            const float left = stereo_src[i * 2];
+            const float right = stereo_src[i * 2 + 1];
+            mono_dst[i] = (left + right) * 0.5f;
+        }
+    }
+
+    /**
+     * @brief Peak limiter with instant attack and exponential release.
+     *
+     * Returns without writing dst when num_samples is 0 or threshold <= 0.
+     * In the vector loops the gain is derived from the largest magnitude of
+     * the whole block and applied uniformly to that block.
+     *
+     * @param src           Input; checked with ASSERT_ALIGNED.
+     * @param dst           Output; checked with ASSERT_ALIGNED.
+     * @param threshold     Magnitude above which gain reduction is applied.
+     * @param limiter_state Optional in/out gain carried between calls; 1.0 is assumed when null.
+     * @param sample_rate   Sample rate used to derive the release coefficient.
+     * @param num_samples   Number of floats to process.
+     */
+    SIMD_EXPORT void limit_audio(const float *src, float *dst, float threshold, float *limiter_state, float sample_rate,
+                                 size_t num_samples)
+    {
+        ASSERT_ALIGNED(src, ALIGNMENT_AVX512);
+        ASSERT_ALIGNED(dst, ALIGNMENT_AVX512);
+
+        if (num_samples == 0 || threshold <= 0.0f)
+        {
+            return;
+        }
+
+        constexpr size_t simd_width = 16;
+        constexpr size_t unroll_factor = 4;
+        constexpr float release_time = 0.05f;
+        // Per-sample release coefficient, used by the scalar tail.
+        float release_coeff = std::exp(-1.0f / (release_time * sample_rate));
+        // The vector loops update the gain once per block, so they need the
+        // per-sample coefficient raised to the block length (exp(-N / (T * fs)));
+        // reusing release_coeff there made the release 64x / 16x slower than in
+        // the scalar tail.
+        const float block_release_coeff =
+            std::exp(-static_cast<float>(simd_width * unroll_factor) / (release_time * sample_rate));
+        const float vector_release_coeff = std::exp(-static_cast<float>(simd_width) / (release_time * sample_rate));
+
+        float current_gain = limiter_state != nullptr ? *limiter_state : 1.0f;
+
+        size_t i = 0;
+
+        // Main loop: one gain decision per 64 samples.
+        for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
+        {
+            auto a0 = _mm512_load_ps(&src[i]);
+            auto a1 = _mm512_load_ps(&src[i + 16]);
+            auto a2 = _mm512_load_ps(&src[i + 32]);
+            auto a3 = _mm512_load_ps(&src[i + 48]);
+
+            auto abs_a0 = _mm512_abs_ps(a0);
+            auto abs_a1 = _mm512_abs_ps(a1);
+            auto abs_a2 = _mm512_abs_ps(a2);
+            auto abs_a3 = _mm512_abs_ps(a3);
+
+            auto max_abs = _mm512_max_ps(_mm512_max_ps(abs_a0, abs_a1),
+                                         _mm512_max_ps(abs_a2, abs_a3));
+
+            float max_sample = _mm512_reduce_max_ps(max_abs);
+
+            float target_gain = max_sample > threshold ? threshold / max_sample : 1.0f;
+
+            if (target_gain < current_gain)
+            {
+                // Attack: clamp immediately.
+                current_gain = target_gain;
+            }
+            else
+            {
+                // Release: move exponentially back towards the target gain.
+                current_gain = target_gain + (current_gain - target_gain) * block_release_coeff;
+            }
+
+            auto gain_vec = _mm512_set1_ps(current_gain);
+
+            auto result0 = _mm512_mul_ps(a0, gain_vec);
+            auto result1 = _mm512_mul_ps(a1, gain_vec);
+            auto result2 = _mm512_mul_ps(a2, gain_vec);
+            auto result3 = _mm512_mul_ps(a3, gain_vec);
+
+            _mm512_store_ps(&dst[i], result0);
+            _mm512_store_ps(&dst[i + 16], result1);
+            _mm512_store_ps(&dst[i + 32], result2);
+            _mm512_store_ps(&dst[i + 48], result3);
+        }
+
+        // Remaining whole vectors: one gain decision per 16 samples.
+        for (; i + simd_width <= num_samples; i += simd_width)
+        {
+            auto a = _mm512_load_ps(&src[i]);
+            auto abs_a = _mm512_abs_ps(a);
+
+            float max_sample = _mm512_reduce_max_ps(abs_a);
+
+            float target_gain = max_sample > threshold ? threshold / max_sample : 1.0f;
+
+            if (target_gain < current_gain)
+            {
+                current_gain = target_gain;
+            }
+            else
+            {
+                current_gain = target_gain + (current_gain - target_gain) * vector_release_coeff;
+            }
+
+            auto gain_vec = _mm512_set1_ps(current_gain);
+            auto result = _mm512_mul_ps(a, gain_vec);
+            _mm512_store_ps(&dst[i], result);
+        }
+
+        // Scalar tail: per-sample gain decision.
+        for (; i < num_samples; ++i)
+        {
+            float sample = src[i];
+            float abs_sample = std::fabs(sample);
+
+            float target_gain = abs_sample > threshold ?
threshold / abs_sample : 1.0f;
+
+            if (target_gain < current_gain)
+            {
+                current_gain = target_gain;
+            }
+            else
+            {
+                current_gain = target_gain + (current_gain - target_gain) * release_coeff;
+            }
+
+            dst[i] = sample * current_gain;
+        }
+
+        if (limiter_state != nullptr)
+        {
+            *limiter_state = current_gain;
+        }
+    }
+
+    /**
+     * @brief Copies src to dst with a linear fade-in at the start and a linear fade-out at the end.
+     *
+     * Samples [0, fade_in_samples) are scaled by i / fade_in_samples, the last
+     * fade_out_samples samples by 1 - k / fade_out_samples (k counted from the
+     * start of the fade-out), and everything in between is copied unchanged.
+     *
+     * NOTE(review): when num_samples <= fade_out_samples no fade-out is applied
+     * and samples past the fade-in are left unwritten; when the two fades
+     * overlap, the fade-out overwrites the fade-in result. Both behaviours are
+     * kept as they were - confirm they are intended.
+     *
+     * @param src              Input; checked with ASSERT_ALIGNED.
+     * @param dst              Output; checked with ASSERT_ALIGNED.
+     * @param fade_in_samples  Length of the fade-in ramp (0 disables it).
+     * @param fade_out_samples Length of the fade-out ramp (0 disables it).
+     * @param num_samples      Number of floats to process.
+     */
+    SIMD_EXPORT void fade_audio(const float *src, float *dst, size_t fade_in_samples, size_t fade_out_samples,
+                                size_t num_samples)
+    {
+        ASSERT_ALIGNED(src, ALIGNMENT_AVX512);
+        ASSERT_ALIGNED(dst, ALIGNMENT_AVX512);
+
+        if (num_samples == 0)
+        {
+            return;
+        }
+
+        constexpr size_t simd_width = 16;
+        constexpr size_t unroll_factor = 4;
+        constexpr size_t block = simd_width * unroll_factor;
+
+        // Builds the 16 gains for ramp positions first .. first + 15:
+        // position * step when rising, 1 - position * step when falling.
+        // This replaces the hand-written 16-argument _mm512_set_ps lists and
+        // evaluates the same per-lane expressions.
+        const auto gain_ramp = [](size_t first, float step, bool falling) -> __m512
+        {
+            constexpr size_t lane_count = 16;
+            alignas(64) float lanes[lane_count];
+            for (size_t k = 0; k < lane_count; ++k)
+            {
+                const float rising = static_cast<float>(first + k) * step;
+                lanes[k] = falling ? 1.0f - rising : rising;
+            }
+            return _mm512_load_ps(lanes);
+        };
+
+        size_t i = 0;
+
+        if (fade_in_samples > 0)
+        {
+            const float fade_in_step = 1.0f / static_cast<float>(fade_in_samples);
+            const size_t fade_in_end = std::min(fade_in_samples, num_samples);
+
+            // i starts at 0 and advances in whole 64-sample blocks, so aligned
+            // loads and stores are valid in this section.
+            for (; i + block <= fade_in_end; i += block)
+            {
+                for (size_t v = 0; v < block; v += simd_width)
+                {
+                    const auto gain = gain_ramp(i + v, fade_in_step, false);
+                    _mm512_store_ps(&dst[i + v], _mm512_mul_ps(_mm512_load_ps(&src[i + v]), gain));
+                }
+            }
+
+            for (; i < fade_in_end; ++i)
+            {
+                const float gain = static_cast<float>(i) / static_cast<float>(fade_in_samples);
+                dst[i] = src[i] * gain;
+            }
+        }
+
+        const size_t middle_start = fade_in_samples;
+        const size_t middle_end = num_samples > fade_out_samples ? num_samples - fade_out_samples : 0;
+
+        if (middle_end > middle_start)
+        {
+            // middle_start is an arbitrary sample index, so &src[j] is not
+            // guaranteed to be 64-byte aligned: the aligned load/store forms
+            // would fault here. Use the unaligned forms.
+            size_t j = middle_start;
+            for (; j + block <= middle_end; j += block)
+            {
+                const auto a0 = _mm512_loadu_ps(&src[j]);
+                const auto a1 = _mm512_loadu_ps(&src[j + 16]);
+                const auto a2 = _mm512_loadu_ps(&src[j + 32]);
+                const auto a3 = _mm512_loadu_ps(&src[j + 48]);
+
+                _mm512_storeu_ps(&dst[j], a0);
+                _mm512_storeu_ps(&dst[j + 16], a1);
+                _mm512_storeu_ps(&dst[j + 32], a2);
+                _mm512_storeu_ps(&dst[j + 48], a3);
+            }
+
+            for (; j < middle_end; ++j)
+            {
+                dst[j] = src[j];
+            }
+        }
+
+        if (fade_out_samples > 0 && num_samples > fade_out_samples)
+        {
+            const size_t fade_out_start = num_samples - fade_out_samples;
+            const float fade_out_step = 1.0f / static_cast<float>(fade_out_samples);
+
+            // fade_out_start is likewise arbitrary: unaligned accesses again.
+            size_t j = fade_out_start;
+            for (; j + block <= num_samples; j += block)
+            {
+                const size_t fade_out_offset = j - fade_out_start;
+                for (size_t v = 0; v < block; v += simd_width)
+                {
+                    const auto gain = gain_ramp(fade_out_offset + v, fade_out_step, true);
+                    _mm512_storeu_ps(&dst[j + v], _mm512_mul_ps(_mm512_loadu_ps(&src[j + v]), gain));
+                }
+            }
+
+            for (; j < num_samples; ++j)
+            {
+                const size_t fade_out_offset = j - fade_out_start;
+                const float gain = 1.0f - static_cast<float>(fade_out_offset) / static_cast<float>(fade_out_samples);
+                dst[j] = src[j] * gain;
+            }
+        }
+    }
+
+    /**
+     * @brief Three-band equaliser built from two one-pole low-pass stages.
+     * @param src         Input; checked with ASSERT_ALIGNED.
+     * @param dst         Output; checked with ASSERT_ALIGNED.
+     * @param low_gain    Gain applied to the first low-pass output.
+     * @param mid_gain    Gain applied to the residual band.
+     * @param high_gain   Gain applied to the second-stage output.
+     * @param eq_state    Optional in/out filter memory; two floats are read and written when non-null.
+     * @param num_samples Number of floats to process.
+     */
+    SIMD_EXPORT void simple_eq(const float *src, float *dst, float low_gain, float mid_gain, float high_gain,
+                               float *eq_state, size_t num_samples)
+    {
+        ASSERT_ALIGNED(src, ALIGNMENT_AVX512);
+        ASSERT_ALIGNED(dst, ALIGNMENT_AVX512);
+
+        if (num_samples == 0)
+        {
+            return;
+        }
+
+        constexpr size_t simd_width = 16;
+        constexpr size_t unroll_factor = 4;
+
+        constexpr float low_cutoff = 0.02f;
+        constexpr float high_cutoff = 0.1f;
+        constexpr float mid_factor = 0.7f;
+
+        float low_state = eq_state != nullptr ? *eq_state : 0.0f;
+        float high_state = eq_state != nullptr ?
*(eq_state + 1) : 0.0f;
+
+        const auto low_gain_vec = _mm512_set1_ps(low_gain);
+        const auto mid_gain_vec = _mm512_set1_ps(mid_gain);
+        const auto high_gain_vec = _mm512_set1_ps(high_gain);
+        const auto low_cutoff_vec = _mm512_set1_ps(low_cutoff);
+        const auto high_cutoff_vec = _mm512_set1_ps(high_cutoff);
+        const auto mid_factor_vec = _mm512_set1_ps(mid_factor);
+        const auto one_minus_low_cutoff_vec = _mm512_set1_ps(1.0f - low_cutoff);
+        const auto one_minus_high_cutoff_vec = _mm512_set1_ps(1.0f - high_cutoff);
+
+        size_t i = 0;
+
+        // NOTE(review): in this loop each vector is filtered lane-wise against
+        // the previous vector (lane k of low1 uses lane k of low0, and low0
+        // uses the broadcast state), whereas the scalar loop below feeds every
+        // sample the output of the sample directly before it. The two paths
+        // therefore do not produce the same samples - confirm this
+        // approximation is intended.
+        for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor)
+        {
+            auto input0 = _mm512_load_ps(&src[i]);
+            auto input1 = _mm512_load_ps(&src[i + 16]);
+            auto input2 = _mm512_load_ps(&src[i + 32]);
+            auto input3 = _mm512_load_ps(&src[i + 48]);
+
+            // First stage: low = low_cutoff * input + (1 - low_cutoff) * previous.
+            auto low_state_vec = _mm512_set1_ps(low_state);
+            auto low0 = _mm512_fmadd_ps(input0, low_cutoff_vec, _mm512_mul_ps(low_state_vec, one_minus_low_cutoff_vec));
+            auto low1 = _mm512_fmadd_ps(input1, low_cutoff_vec, _mm512_mul_ps(low0, one_minus_low_cutoff_vec));
+            auto low2 = _mm512_fmadd_ps(input2, low_cutoff_vec, _mm512_mul_ps(low1, one_minus_low_cutoff_vec));
+            auto low3 = _mm512_fmadd_ps(input3, low_cutoff_vec, _mm512_mul_ps(low2, one_minus_low_cutoff_vec));
+
+            // Second stage runs the same one-pole form on (input - low).
+            auto high0 = _mm512_sub_ps(input0, low0);
+            auto high1 = _mm512_sub_ps(input1, low1);
+            auto high2 = _mm512_sub_ps(input2, low2);
+            auto high3 = _mm512_sub_ps(input3, low3);
+
+            auto high_state_vec = _mm512_set1_ps(high_state);
+            high0 = _mm512_fmadd_ps(high0, high_cutoff_vec, _mm512_mul_ps(high_state_vec, one_minus_high_cutoff_vec));
+            high1 = _mm512_fmadd_ps(high1, high_cutoff_vec, _mm512_mul_ps(high0, one_minus_high_cutoff_vec));
+            high2 = _mm512_fmadd_ps(high2, high_cutoff_vec, _mm512_mul_ps(high1, one_minus_high_cutoff_vec));
+            high3 = _mm512_fmadd_ps(high3, high_cutoff_vec, _mm512_mul_ps(high2, one_minus_high_cutoff_vec));
+
+            // Residual band: whatever the two stages did not capture.
+            auto mid0 = _mm512_mul_ps(_mm512_sub_ps(_mm512_sub_ps(input0, low0), high0), mid_factor_vec);
+            auto mid1 = _mm512_mul_ps(_mm512_sub_ps(_mm512_sub_ps(input1, low1), high1), mid_factor_vec);
+            auto mid2 = _mm512_mul_ps(_mm512_sub_ps(_mm512_sub_ps(input2, low2), high2), mid_factor_vec);
+            auto mid3 = _mm512_mul_ps(_mm512_sub_ps(_mm512_sub_ps(input3, low3), high3), mid_factor_vec);
+
+            auto result0 = _mm512_fmadd_ps(low0, low_gain_vec,
+                                           _mm512_fmadd_ps(mid0, mid_gain_vec, _mm512_mul_ps(high0, high_gain_vec)));
+            auto result1 = _mm512_fmadd_ps(low1, low_gain_vec,
+                                           _mm512_fmadd_ps(mid1, mid_gain_vec, _mm512_mul_ps(high1, high_gain_vec)));
+            auto result2 = _mm512_fmadd_ps(low2, low_gain_vec,
+                                           _mm512_fmadd_ps(mid2, mid_gain_vec, _mm512_mul_ps(high2, high_gain_vec)));
+            auto result3 = _mm512_fmadd_ps(low3, low_gain_vec,
+                                           _mm512_fmadd_ps(mid3, mid_gain_vec, _mm512_mul_ps(high3, high_gain_vec)));
+
+            _mm512_store_ps(&dst[i], result0);
+            _mm512_store_ps(&dst[i + 16], result1);
+            _mm512_store_ps(&dst[i + 32], result2);
+            _mm512_store_ps(&dst[i + 48], result3);
+
+            // Carry the state of the LAST sample of the block (lane 15).
+            // Extracting the top 128-bit quarter yields lanes 12..15, and
+            // _mm_cvtss_f32 alone would read lane 12, so broadcast element 3
+            // first - the same way the AVX2 implementation picks its last lane.
+            const __m128 low_tail = _mm512_extractf32x4_ps(low3, 3);
+            low_state = _mm_cvtss_f32(_mm_shuffle_ps(low_tail, low_tail, _MM_SHUFFLE(3, 3, 3, 3)));
+            const __m128 high_tail = _mm512_extractf32x4_ps(high3, 3);
+            high_state = _mm_cvtss_f32(_mm_shuffle_ps(high_tail, high_tail, _MM_SHUFFLE(3, 3, 3, 3)));
+        }
+
+        // Scalar tail: true sample-by-sample recurrence.
+        for (; i < num_samples; ++i)
+        {
+            float input = src[i];
+
+            float low_output = low_cutoff * input + (1.0f - low_cutoff) * low_state;
+            low_state = low_output;
+
+            float high_input = input - low_output;
+            float high_output = high_cutoff * high_input + (1.0f - high_cutoff) * high_state;
+            high_state = high_output;
+
+            float mid_output = (input - low_output - high_output) * mid_factor;
+
+            dst[i] = low_output * low_gain + mid_output * mid_gain + high_output * high_gain;
+        }
+
+        // Persist both filter memories for the next call.
+        if (eq_state != nullptr)
+        {
+            *eq_state = low_state;
+            *(eq_state + 1) = high_state;
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/simd/simd_func_dispatcher.cpp b/src/simd/simd_func_dispatcher.cpp
deleted file mode 100644
index 6ffcb89..0000000
--- a/src/simd/simd_func_dispatcher.cpp
+++ /dev/null
@@ -1,428
+0,0 @@ -/** - * @file simd_func_dispatcher.cpp - * @brief SIMD函数调度器实 - 运行时函数分发的核心实现 - * - * 本文件实现了SIMD数调度器的核心功能,包括: - * - 函数注册表状态查询和调试输出 - * - 版本枚举与字符串之间的双向转换 - * - 运行时函数版本选择的辅助功能 - * - * 核心实现功能: - * ============================================================================ - * 1. print_registry_status() - 调试诊断功 - * - 遍历并输出所有已注册的函数及其可用版本 - * - 用于运行时验证函数注册是否正确 - * - 帮助开发者了解当前可用的SIMD优化函数 - * - * 2. simd_func_version_to_string() - 版本到字符串转换 - * - 将枚举值转换为可读的字符串表示 - * - 用于日志输出、调试信息和用户界面显示 - * - 采用switch-case实现确保编译时类型安全 - * - * 3. string_to_simd_func_version() - 字符串到版本转换 - * - 将字符串解析为版本枚举值 - * - 用于配置文件解析、命令行参数处理 - * - 提供回退机制:无法识别时返回SCALAR版本 - * - * 实现特点: - * ============================================================================ - * - 简洁明了:实现直观,易于维护和扩展 - * - 类型安全:使用强类型枚举,避免魔法数字 - * - 完整映射:覆盖所有定义的SIMD版本 - * - 健壮性:处理未知版本的边界情况 - * - 零依赖:仅依赖标准库和项目头文件 - * - * 性能考虑: - * ============================================================================ - * - 版本转换函数使用简单的条件判断,性能开销可忽略 - * - print_registry_status() 仅用于调试,不在性能关键路径上 - * - 字符串比较采用高效的std::string相等性判断 - * - * 设计模式: - * ============================================================================ - * - 该文件是simd_func_dispatcher类的实现部分 - * - 采用了接口与实现分离的设计 - * - 头文件定义接口和模板实现,cpp文件实现非模板函数 - * - * @note 这些函数主要用于调试、日志和配置解析,不在性能关键路径上 - * @see simd_func_dispatcher.h 查看完整的类定义和接口说明 - */ - -#include "simd_func_dispatcher.h" -#include - -/** - * @brief 打印函数注册表状态 - 调试和诊断工具 - * - * 该函数遍历并打印所有已注册到调度器的SIMD函数及其可用版本, - * 是一个重要的调试和诊断工具。 - * - * 功能详述: - * ============================================================================ - * 1. 遍历函数注册表 - * - 访问func_registry_中每个函数名和持有者对 - * - 使用基于范围的for循环提高代码可读性 - * - * 2. 获取版本信息 - * - 调用holder的has_implementation()检查是否有实现 - * - 如有实现,调用get_available_versions()获取所有已注册版本 - * - * 3. 
格式化输出 - * - 函数名:清晰标识当前函数 - * - 版本列表:展示所有可用的SIMD优化版本 - * - 未实现提示:明确标识未注册的函数 - * - * 输出格式示例: - * ============================================================================ - * ``` - * Registered SIMD Functions: - * Function: process_audio - * Available Versions: SCALAR SSE4 AVX2 - * Function: mix_channels - * Available Versions: SCALAR NEON - * Function: apply_gain - * No implementations registered. - * ``` - * - * 使用场景: - * ============================================================================ - * - 程序启动时验证函数注册是否正确 - * - 调试时检查特定函数的可用版本 - * - 性能分析时了解当前使用的优化级别 - * - 单元测试中验证注册逻辑 - * - * 性能考虑: - * ============================================================================ - * - 该函数仅用于调试,不应在性能关键代码中频繁调用 - * - 使用printf而非std::cout以减少头文件依赖和提高输出性能 - * - 遍历整个注册表的时间复杂度为O(n*m),n为函数数量,m为平均版本数 - * - * @note 这是一个const成员数,不会修改调度器状态 - * @note 输出直接发送到stdout,适合命令行程序使用 - */ -void simd_func_dispatcher::print_registry_status() const { - // 打印标题,标识输出内容 - printf("Registered SIMD Functions:\n"); - - // 遍历函数注册表中的所有条目 - // pair.first: 函数名称(std::string) - // pair.second: 函数持有者的unique_ptr(func_holder_base*) - for (const auto& pair : func_registry_) { - const auto& func_name = pair.first; // 获取函数名称 - const auto& holder = pair.second; // 获取函数持有者 - - // 输出当前函数名称 - printf("Function: %s\n", func_name.c_str()); - - // 检查该函数是否有任何版本的实现 - if (holder->has_implementation()) { - // 获取所有已注册的版本列表 - auto versions = holder->get_available_versions(); - - // 输出版本列表的标题 - printf(" Available Versions: "); - - // 遍历并输出每个可用版本 - // 使用switch-case将枚值转换为可读的字符串 - for (const auto& version : versions) { - switch (version) { - // x86/x64架构的SIMD版 - case simd_func_version::SCALAR: - printf("SCALAR "); // 标量版本(无SIMD化) - break; - case simd_func_version::SSE: - printf("SSE "); // SSE/SSE2版本(128位量) - break; - case simd_func_version::SSE3: - printf("SSE3 "); // SSE3/SSSE3版本 - break; - case simd_func_version::SSE4: - printf("SSE4 "); // SSE4.1/SSE4.2版本 - break; - case simd_func_version::AVX: - printf("AVX "); // 
AVX版本(256位向量) - break; - case simd_func_version::AVX2: - printf("AVX2 "); // AVX2+FMA版本 - break; - case simd_func_version::AVX512: - printf("AVX512 "); // AVX-512版本(512位量) - break; - - // ARM架构的SIMD版本 - case simd_func_version::NEON: - printf("NEON "); // ARM NEON版本 - break; - case simd_func_version::NEON_FP16: - printf("NEON_FP16 "); // NEON半精度浮点版 - break; - - // RISC-V架构的向量扩展 - case simd_func_version::VECTOR: - printf("VECTOR "); // RISC-V向量扩展 - break; - - // 处理未知版本(理论上不应出现) - default: - break; - } - } - // 版本列表输出完毕,换行 - printf("\n"); - } - else { - // 该函数尚未注册任何实现版本 - printf(" No implementations registered.\n"); - } - } -} - -/** - * @brief SIMD版本枚举转字串 - 将版本枚举值转换为可读字符串 - * @param version SIMD函数版本枚举 - * @return 对应的版本名称字符串(C风格字符串) - * - * 该函数提供版本枚举到字符串的标准转换,用于: - * - 日志输出和调试信息 - * - 用户界面显示 - * - 配置文件生成 - * - 错误消息构造 - * - * 实现策略: - * ============================================================================ - * 1. 使用switch-case实现完全映射 - * - 编译器可以检测遗漏的case分支 - * - 保证类型安全,避免隐式转换 - * - 性能优秀,通常编译为跳转表 - * - * 2. 返回C字符串而非std::string - * - 避免内存分配开销 - * - 字符串字面量存储在只读数据段 - * - 生命周期为整个程序运行期 - * - * 3. 
提供默认处理 - * - 对于未识别的枚举值返回"UNKNOWN" - * - 提高代码健壮性,防止未定义行为 - * - * 映射关系: - * ============================================================================ - * | 枚举值 | 返回字符串 | 说明 | - * |-------------------|--------------|------------------------| - * | SCALAR | "SCALAR" | 标量实现 | - * | SSE | "SSE" | SSE/SSE2指令集 | - * | SSE3 | "SSE3" | SSE3/SSSE3指令集 | - * | SSE4 | "SSE4" | SSE4.1/4.2指令集 | - * | AVX | "AVX" | AVX指令集 | - * | AVX2 | "AVX2" | AVX2+FMA指令集 | - * | AVX512 | "AVX512" | AVX-512指令集 | - * | NEON | "NEON" | ARM NEON指令集 | - * | NEON_FP16 | "NEON_FP16" | NEON半精度浮点 | - * | VECTOR | "VECTOR" | RISC-V向量扩展 | - * | 其他 | "UNKNOWN" | 未知或非法值 | - * - * 使用示例: - * ============================================================================ - * @code - * // 日志输出 - * const char* name = simd_func_version_to_string(simd_func_version::AVX2); - * logger->info("Using SIMD version: {}", name); // 输出: Using SIMD version: AVX2 - * - * // 调试信息 - * printf("Current version: %s\n", simd_func_version_to_string(current_version)); - * - * // 配置文件生成 - * config_file << "preferred_version=" << simd_func_version_to_string(preferred) << "\n"; - * @endcode - * - * 性能特性: - * ============================================================================ - * - 时间复杂度:O(1) - 编译器通常优化为跳转表或二分查找 - * - 空间复杂度:O(1) - 字符串字面量存储在只读数据段 - * - 无内存分配:返回静态字符串,无运行时开销 - * - 线程安全:只读操作,无共享状态修改 - * - * @note 返回的字符串为静态存储,调用者不应修改或释放 - * @note constexpr修饰符未使用是因为字符串字面量返回类型的限制 - * @see string_to_simd_func_version() 执行反向换 - */ -const char* simd_func_version_to_string(simd_func_version version) { - // 使用switch-case实现全映射 - // 编译器会检测是否遗漏case分支(如果使用-Wswitch警告) - switch (version) { - // 标量版本 - 基础实现,所有平台都支持 - case simd_func_version::SCALAR: - return "SCALAR"; - - // x86/x64 SIMD指令集版本(发展顺序) - case simd_func_version::SSE: - return "SSE"; // 2003年:Pentium 4引入,128位向量 - case simd_func_version::SSE3: - return "SSE3"; // 2006年:Core微架构增强浮点运算 - case simd_func_version::SSE4: - return "SSE4"; // 2008年:Nehalem微架,增强整数和字符串处理 - case 
simd_func_version::AVX: - return "AVX"; // 2011年:Sandy Bridge,256位向量 - case simd_func_version::AVX2: - return "AVX2"; // 2013年:Haswell,完256位整数运算+FMA - case simd_func_version::AVX512: - return "AVX512"; // 2016年:Xeon Phi/Skylake-X,512位量 - - // ARM SIMD指令集版本 - case simd_func_version::NEON: - return "NEON"; // ARMv8-A标准,128位量 - case simd_func_version::NEON_FP16: - return "NEON_FP16"; // ARMv8.2-A,硬件半精度浮点支持 - - // RISC-V向量扩展 - case simd_func_version::VECTOR: - return "VECTOR"; // RISC-V V扩展,可变长度量 - - // 默认情况:处理未知或非法的枚举值 - // 这提供了一层安全保障,虽然理论上不应到达这里 - default: - break; - } - - // 返回未知版本标识 - // 这种情况可能发生在: - // 1. 枚举值被错误地强制转换 - // 2. 内存损坏导致枚举值异常 - // 3. 跨版本兼容性问题 - return "UNKNOWN"; -} - -/** - * @brief 字符串转SIMD版本举 - 将字符串解析为版本枚举值 - * @param version_str 版本名称字符串(如"AVX2"、"NEON"等) - * @return 对应的SIMD函数版枚举值 - * - * 该函数将字符串表示的版本名称转换为枚举值,用于: - * - 配置文件解析(读取用户指定的SIMD版本偏好) - * - 命令行参数处理(--simd-version=AVX2) - * - 环境变量解析(SIMD_VERSION=SSE4) - * - 测试和调试(手动指定测试版本) - * - * 实现策略: - * ============================================================================ - * 1. 线性字符串比较 - * - 按从高到低的版本顺序检查 - * - 使用std::string的相比较运算符 - * - 大小写敏感匹配 - * - * 2. 回退到SCALAR版本 - * - 无法识别的字符串返回最安全的SCALAR版本 - * - 保证函数永远返回有效值 - * - 避免程序因无效输入而崩溃 - * - * 3. 
优化潜力 - * - 当前实现简单但足够高效(此函数不在热路径上) - * - 可能的优化:哈希表查找、Trie树、完美哈希 - * - 未优化原因:此函数主要在启动时调用,性能影响可忽略 - * - * 支持的字符串: - * ============================================================================ - * | 输入字符串 | 返回枚举值 | 备注 | - * |------------------|------------------------|---------------------| - * | "SCALAR" | simd_func_version::SCALAR | 标量实现 | - * | "SSE" | simd_func_version::SSE | SSE/SSE2 | - * | "SSE3" | simd_func_version::SSE3 | SSE3/SSSE3 | - * | "SSE4" | simd_func_version::SSE4 | SSE4.1/4.2 | - * | "AVX" | simd_func_version::AVX | AVX指令集 | - * | "AVX2" | simd_func_version::AVX2 | AVX2+FMA | - * | "AVX512" | simd_func_version::AVX512 | AVX-512 | - * | "NEON" | simd_func_version::NEON | ARM NEON | - * | "NEON_FP16" | simd_func_version::NEON_FP16 | NEON半精度 | - * | "VECTOR" | simd_func_version::VECTOR | RISC-V向量 | - * | 其他任何字符串 | simd_func_version::SCALAR | 默认回退 | - * - * 使用示例: - * ============================================================================ - * @code - * // 配置文件解析 - * std::string config_version = config["simd_version"]; - * auto version = string_to_simd_func_version(config_version); - * - * // 命令行参数处理 - * if (argc > 1) { - * auto preferred = string_to_simd_func_version(argv[1]); - * // 使用preferred版本... - * } - * - * // 环境变量解析 - * const char* env_version = std::getenv("SIMD_VERSION"); - * if (env_version) { - * auto version = string_to_simd_func_version(env_version); - * } - * - * // 测试中指定版本 - * auto test_version = string_to_simd_func_version("AVX2"); - * test_with_version(test_version); - * @endcode - * - * 设计考虑: - * ============================================================================ - * 1. 大小写敏感 - * - 当前实现要求精确匹配("AVX2"而非"avx2") - * - 原因:保持一致性,避免混淆 - * - 改进:可添加大小写不敏感版本或预处理转换 - * - * 2. 错误处理策略 - * - 采用"宽容"策略无效输入返回SCALAR而非抛异常 - * - 优点:避免程序崩溃,提供最低限度的功能 - * - 缺点:可能掩盖配置错误 - * - 建议:调用者应验证输入或记录回退日志 - * - * 3. 
性能特性 - * - 最坏情况:O(n) - n为版本数量(约10次字符串比较) - * - 平均情况:取决于输入分布 - * - 优化可能:哈希表O(1),但增加复杂度 - * - 当前选择:简单性优先(此函数不在性能关键路径) - * - * @note 字符串比较是大小写敏感的 - * @note 无法识别的字符串会回退到SCALAR版本,不会抛出异常 - * @note 建议在调用后验证返回值是否符合预期 - * @see simd_func_version_to_string() 执行反向转换 - */ -simd_func_version string_to_simd_func_version(const std::string& version_str) { - // 按版本由高到低的顺序进行检查 - // 这样的顺序有助于代码可读性,但对性能无实质影响 - - // 标量版本检查 - if (version_str == "SCALAR") - return simd_func_version::SCALAR; - - // x86/x64 SIMD版本检查(按令集发展顺序) - if (version_str == "SSE") - return simd_func_version::SSE; - if (version_str == "SSE3") - return simd_func_version::SSE3; - if (version_str == "SSE4") - return simd_func_version::SSE4; - if (version_str == "AVX") - return simd_func_version::AVX; - if (version_str == "AVX2") - return simd_func_version::AVX2; - if (version_str == "AVX512") - return simd_func_version::AVX512; - - // ARM SIMD版本检查 - if (version_str == "NEON") - return simd_func_version::NEON; - if (version_str == "NEON_FP16") - return simd_func_version::NEON_FP16; - - // RISC-V向量扩展检查 - if (version_str == "VECTOR") - return simd_func_version::VECTOR; - - // 无法识别的字符串:回退到SCALAR版本 - // 这提供了最基本的兼容性保证 - // 可能的原因: - // - 拼写错误:"avx2"小写)、"AVX 2"(有空格) - // - 不支持的版本名称:"AVX10"、"SSE5" - // - 空字符串或格式错误的输入 - // - // 注意:这里静默回退可能掩盖配置错误 - // 建议:调用者在关键场景应添加日志记录 - return simd_func_version::SCALAR; // 默认返回SCALAR -} diff --git a/src/simd/simd_func_dispatcher.h b/src/simd/simd_func_dispatcher.h deleted file mode 100644 index 4aec624..0000000 --- a/src/simd/simd_func_dispatcher.h +++ /dev/null @@ -1,593 +0,0 @@ -/** - * @file simd_func_dispatcher.h - * @brief SIMD函数调度器 - 运行时选择最优SIMD实现的核心框架 - * - * 本模块提供了一个强大而灵活的SIMD函数调度系统,能够: - * - 在运行时根据CPU特性自动选择最优的SIMD实现 - * - 支持多版本函数(标量、SSE、AVX、NEON等)的统一管理 - * - 提供类型安全的函数注册和调用机制 - * - 自动回退到兼容性更好的实现版本 - * - * 核心设计思想: - * 1. 多版本实现:每个函数可以有多个针对不同SIMD级别的优化版本 - * 2. 运行时选择:程序启动时检测CPU特性,选择最佳版本 - * 3. 透明调用:用户调用时无需关心具体使用哪个版本 - * 4. 类型安全:使用模板和std::function确保类型匹配 - * - * 工作流程: - * ``` - * [注册阶段] - * 1. 
为每个函数注册多个SIMD版本的实现 - * 2. 调度器存储所有版本并根据CPU能力选择最优版本 - * - * [调用阶段] - * 3. 用户调用函数时,调度器自动使用预选的最优版本 - * 4. 如果最优版本不可用,自动回退到次优版本 - * ``` - * - * 使用示例: - * @code - * // 注册函数的多个版本 - * REGISTER_SIMD_FUNCTION("process_audio", simd_func_version::SCALAR, scalar_impl); - * REGISTER_SIMD_FUNCTION("process_audio", simd_func_version::AVX2, avx2_impl); - * - * // 获取并调用最优版本 - * auto& func = GET_SIMD_FUNCTION(void(float*, size_t), "process_audio"); - * func(data, count); // 自动使用AVX2版本(如果CPU支持) - * @endcode - * - * @note 这是整个SIMD优化框架的核心组件 - * @see cpu_feature_detector, multi_version_func - */ - -#pragma once -#include -#include -#include -#include -#include -#include - -#include "cpu_features.h" - -/** - * @enum simd_func_version - * @brief SIMD函数版本枚举 - 定义所有可能的函数实现版本 - * - * 该枚举定义了函数可以有的所有SIMD优化版本。 - * 每个版本对应一个特定的SIMD指令集级别。 - * - * 版本排序: - * - 枚举值从低到高表示性能从弱到强 - * - SCALAR是最基础的版本,所有CPU都支持 - * - COUNT用于数组大小,不是实际版本 - * - * 与simd_level的关系: - * - simd_level表示CPU的能力级别 - * - simd_func_version表示函数的实现版本 - * - 通过simd_level_to_version()进行转换 - * - * @note 不是所有函数都需要实现所有版本 - * @see simd_level, simd_level_to_version() - */ -enum class simd_func_version { - /** 标量实现 - 纯C++代码,无SIMD优化 - * - 兼容性:所有CPU - * - 性能:基准性能(1x) - * - 用途:最低保底实现、参考实现 - * - 必要性:强制要求,作为回退版本 - */ - SCALAR = 0, - - /** SSE实现 - 使用SSE/SSE2指令 - * - 兼容性:2003年后的所有x86/x64 - * - 向量宽度:128位 - * - 性能提升:约2-4倍 - */ - SSE, - - /** SSE3实现 - 使用SSE3/SSSE3指令 - * - 兼容性:2006年后的主流CPU - * - 新增功能:水平运算、复数支持 - * - 性能提升:比SSE快10-20% - */ - SSE3, - - /** SSE4实现 - 使用SSE4.1/SSE4.2指令 - * - 兼容性:2008年后的主流CPU - * - 新增功能:点积、blend、字符串处理 - * - 性能提升:比SSE3快15-30% - */ - SSE4, - - /** AVX实现 - 使用AVX指令 - * - 兼容性:2011年后的主流CPU - * - 向量宽度:256位 - * - 性能提升:约2倍SSE4性能 - */ - AVX, - - /** AVX2实现 - 使用AVX2 + FMA指令 - * - 兼容性:2013年后的主流CPU - * - 新增功能:完整256位整数运算、FMA - * - 性能提升:比AVX快50-100% - * - 推荐:当前最佳性能/兼容性平衡点 - */ - AVX2, - - /** AVX-512实现 - 使用AVX-512指令集 - * - 兼容性:2016年后的高端CPU - * - 向量宽度:512位 - * - 性能提升:约2倍AVX2性能(理论) - * - 注意:可能导致CPU降频 - */ - AVX512, - - /** NEON实现 - 使用ARM NEON指令 - * - 
兼容性:所有ARMv8-A (64位ARM) - * - 向量宽度:128位 - * - 性能:与SSE4相当 - * - 应用:移动设备、Apple Silicon - */ - NEON, - - /** NEON + FP16实现 - 使用NEON半精度浮点 - * - 兼容性:ARMv8.2-A及更新 - * - 新增:硬件FP16运算 - * - 性能:FP16运算快2倍 - * - 应用:移动端AI推理 - */ - NEON_FP16, - - /** RISC-V向量扩展实现 - * - 兼容性:支持RVV的RISC-V处理器 - * - 特点:可变向量长度 - * - 应用:嵌入式、IoT - */ - VECTOR, - - /** 版本数量标记 - * 用于数组大小定义,不是实际的函数版本 - */ - COUNT -}; - -/** - * @brief 将SIMD级别转换为函数版本 - * @param level CPU的SIMD级别 - * @return 对应的函数版本枚举值 - * - * 将cpu_feature_detector检测到的SIMD级别转换为 - * 函数调度器使用的版本标识。 - * - * 映射关系: - * - simd_level::NONE -> simd_func_version::SCALAR - * - simd_level::SSE -> simd_func_version::SSE - * - simd_level::AVX2 -> simd_func_version::AVX2 - * - 等等... - * - * @note constexpr函数,编译时求值,零运行时开销 - * @see simd_level, simd_func_version - */ -constexpr auto simd_level_to_version(simd_level level) { - switch (level) { - case simd_level::NONE: - return simd_func_version::SCALAR; - case simd_level::SSE: - return simd_func_version::SSE; - case simd_level::SSE3: - return simd_func_version::SSE3; - case simd_level::SSE4: - return simd_func_version::SSE4; - case simd_level::AVX: - return simd_func_version::AVX; - case simd_level::AVX2: - return simd_func_version::AVX2; - case simd_level::AVX512: - return simd_func_version::AVX512; - case simd_level::NEON: - return simd_func_version::NEON; - case simd_level::NEON_FP16: - return simd_func_version::NEON_FP16; - } - - // 默认回退到标量版本 - return simd_func_version::SCALAR; -} - -// 前向声明 -template -class multi_version_func; - -/** - * @class multi_version_func - * @brief 多版本函数容器 - 管理同一函数的多个SIMD优化版本 - * @tparam return_type 函数返回类型 - * @tparam args 函数参数类型列表 - * - * 该类模板存储和管理一个函数的所有SIMD版本实现, - * 并能根据CPU能力自动选择最优版本。 - * - * 核心功能: - * 1. 存储多个版本:为每个SIMD级别存储一个函数实现 - * 2. 自动选择:根据CPU能力选择最优可用版本 - * 3. 智能回退:如果最优版本不可用,自动使用次优版本 - * 4. 
类型安全:使用std::function确保所有版本签名一致 - * - * 选择策略: - * - 获取推荐的SIMD级别(考虑性能和兼容性) - * - 从推荐级别开始,向下查找第一个可用的实现 - * - 如果都不可用,抛出异常 - * - * 使用示例: - * @code - * multi_version_func func; - * func.register_version(simd_func_version::SCALAR, scalar_impl); - * func.register_version(simd_func_version::AVX2, avx2_impl); - * - * // 自动选择最优版本并调用 - * func(data, count); - * @endcode - * - * @note 通常不直接使用,而是通过simd_func_dispatcher管理 - * @see simd_func_dispatcher - */ -template -class multi_version_func { -public: - /// 函数类型定义 - using func_type = std::function; - /// 函数数组类型(存储所有版本) - using func_arr = std::array(simd_func_version::COUNT)>; - - /** - * @brief 默认构造函数 - * - * 创建空的多版本函数容器,所有版本槽位初始化为nullptr - */ - multi_version_func() = default; - - /** - * @brief 注册函数的特定版本实现 - * @param version SIMD版本标识 - * @param func 该版本的函数实现 - * - * 将一个函数实现注册到指定的SIMD版本槽位。 - * 如果该槽位已有实现,会被覆盖。 - * 注册后会自动更新最佳函数选择。 - * - * @note 使用std::move避免不必要的拷贝 - */ - void register_version(simd_func_version version, func_type func) { - functions_[static_cast(version)] = std::move(func); - best_func_ = get_best_func(); // 更新最佳函数 - } - - /** - * @brief 获取当前最佳的函数实现 - * @return 最优函数的引用 - * - * 根据以下策略选择最佳函数: - * 1. 获取推荐的SIMD级别(考虑CPU特性和性能) - * 2. 转换为函数版本枚举 - * 3. 从推荐版本开始向下查找第一个可用的实现 - * 4. 
如果都不可用,返回空函数指针 - * - * 回退顺序示例(假设推荐AVX2): - * AVX2 -> AVX -> SSE4 -> SSE3 -> SSE -> SCALAR - * - * @note 该函数会被缓存到best_func_成员,避免重复查找 - */ - const auto& get_best_func() const { - const auto recommended_level = get_recommended_simd_level(); - const auto referred_version = simd_level_to_version(recommended_level); - - // 从首选版本开始,向下查找可用的实现 - for (int v = static_cast(referred_version); v >= 0; --v) { - auto version = static_cast(v); - if (const auto& func = functions_[static_cast(version)]) { return func; } - } - - // 如果没有找到任何实现,返回一个空函数 - static const func_type empty_func = nullptr; - return empty_func; - } - - /** - * @brief 函数调用运算符 - 执行最优版本的函数 - * @param in_args 转发给函数的参数 - * @return 函数执行结果 - * @throws std::runtime_error 如果没有可用的实现 - * - * 自动选择并调用最佳版本的函数实现。 - * 参数会被完美转发到实际的函数。 - * - * @note 这使得multi_version_func对象可以像普通函数一样调用 - */ - auto operator()(args... in_args) const { - if (!best_func_) { - throw std::runtime_error("没有可用的SIMD实现。"); - } - return best_func_(std::forward(in_args)...); - } - - /** - * @brief 检查是否有任何版本的实现 - * @return true表示至少有一个版本已注册 - * - * 用于验证函数是否已正确注册。 - */ - auto has_implementation() const { - return std::any_of(functions_.begin(), functions_.end(), [](const auto& func) { return func != nullptr; }); - } - - /** - * @brief 获取所有已注册版本的列表 - * @return 已注册的版本枚举值向量 - * - * 用于调试和状态查询,列出该函数有哪些版本的实现。 - * - * 示例输出:[SCALAR, SSE, AVX2] - */ - auto get_available_versions() const { - std::vector available_versions; - for (size_t i = 0; i < functions_.size(); ++i) { - if (functions_[i]) { available_versions.push_back(static_cast(i)); } - } - return available_versions; - } - -private: - func_arr functions_{}; ///< 所有版本的函数数组 - func_type best_func_{nullptr}; ///< 缓存的最佳函数(性能优化) -}; - -/** - * @class simd_func_dispatcher - * @brief SIMD函数调度器 - 全局函数注册和调度中心 - * - * 这是整个SIMD优化框架的核心类,采用单例模式管理所有SIMD优化函数。 - * - * 主要职责: - * 1. 函数注册:接受多版本函数的注册 - * 2. 函数存储:使用类型擦除技术统一管理不同签名的函数 - * 3. 函数查询:根据名称和签名获取最优版本的函数 - * 4. 函数调用:提供便捷的调用接口 - * 5. 
状态查询:列出所有已注册的函数及其版本 - * - * 设计特点: - * - 单例模式:全局唯一实例,集中管理 - * - 类型安全:模板确保函数签名匹配 - * - 类型擦除:不同签名的函数可以存储在同一容器中 - * - 延迟绑定:运行时根据CPU特性选择最优版本 - * - * 典型工作流程: - * ``` - * [初始化阶段] - * 1. 程序启动时,各模块注册自己的SIMD函数 - * REGISTER_SIMD_FUNCTION("mix_audio", SCALAR, scalar_mix); - * REGISTER_SIMD_FUNCTION("mix_audio", AVX2, avx2_mix); - * - * [运行阶段] - * 2. 代码中获取并调用函数 - * auto& mix = GET_SIMD_FUNCTION(void(float*, float*, float*, size_t), "mix_audio"); - * mix(src1, src2, dst, count); // 自动使用AVX2版本 - * ``` - * - * @note 通常通过宏REGISTER_SIMD_FUNCTION、GET_SIMD_FUNCTION使用 - * @see multi_version_func, lazy_singleton - */ -class simd_func_dispatcher : public lazy_singleton { -public: - friend class lazy_singleton; - - /** - * @brief 注册函数的特定版本实现 - * @tparam func_signature 函数签名类型(如void(float*, size_t)) - * @param func_name 函数名称(字符串标识) - * @param version SIMD版本标识 - * @param func 该版本的函数实现 - * - * 将一个函数的特定SIMD版本注册到调度器。 - * 如果该函数名第一次出现,会自动创建多版本函数容器。 - * 如果该版本已存在,会被新实现覆盖。 - * - * @note 推荐使用REGISTER_SIMD_FUNCTION宏而不是直接调用 - */ - template - void register_function(const std::string& func_name, - simd_func_version version, - std::function func) { - auto& holder = get_or_create_func(func_name); - holder.register_version(version, std::move(func)); - } - - /** - * @brief 获取函数的最优版本 - * @tparam func_signature 函数签名类型 - * @param func_name 函数名称 - * @return 多版本函数对象的引用 - * @throws std::runtime_error 如果函数未注册 - * - * @note 推荐使用GET_SIMD_FUNCTION宏 - */ - template - const auto& get_function(const std::string& func_name) const { - const auto& it = func_registry_.find(func_name); - if (it == func_registry_.end()) { - throw std::runtime_error("函数 '" + func_name + "' 未注册"); - } - - auto* holder = static_cast*>(it->second.get()); - return holder->func; - } - - /** - * @brief 直接调用函数(便捷接口) - * @tparam func_signature 函数签名类型 - * @tparam args 参数类型包 - * @param func_name 函数名称 - * @param in_args 转发给函数的参数 - * @return 函数执行结果 - * @throws std::runtime_error 如果函数未注册或无可用实现 - * - * @note 推荐使用CALL_SIMD_FUNCTION宏 - */ - template - auto 
call_function(const std::string& func_name, args&&... in_args) const { - const auto& func = get_function(func_name); - return func(std::forward(in_args)...); - } - - /** - * @brief 列出所有已注册的函数名称 - * @return 函数名称列表 - */ - [[nodiscard]] auto list_functions() const -> std::vector { - std::vector func_names; - for (const auto& pair : func_registry_) { func_names.push_back(pair.first); } - return func_names; - } - - /** - * @brief 打印所有函数的注册状态 - * @see simd_func_dispatcher.cpp 实现在cpp文件中 - */ - void print_registry_status() const; - -private: - /** 函数持有者基类 - 类型擦除的基础 */ - struct func_holder_base { - virtual ~func_holder_base() = default; - [[nodiscard]] virtual auto get_available_versions() const -> std::vector = 0; - [[nodiscard]] virtual auto has_implementation() const -> bool = 0; - }; - - /** 具体的函数持有者模板 */ - template - struct func_holder : func_holder_base { - multi_version_func func; - [[nodiscard]] auto get_available_versions() const -> std::vector override { - return func.get_available_versions(); - } - [[nodiscard]] auto has_implementation() const -> bool override { return func.has_implementation(); } - }; - - /** 获取或创建函数持有者(内部辅助函数) */ - template - auto& get_or_create_func(const std::string& func_name) { - const auto& it = func_registry_.find(func_name); - if (it != func_registry_.end()) { - auto* holder = static_cast*>(it->second.get()); - return holder->func; - } - auto holder = std::make_unique>(); - auto* ptr = holder.get(); - func_registry_[func_name] = std::move(holder); - return ptr->func; - } - - /// 函数注册表:函数名 -> 函数持有者的映射 - std::unordered_map> func_registry_{}; -}; - -/** - * @def REGISTER_SIMD_FUNCTION - * @brief 注册SIMD函数宏 - 便捷的函数注册接口 - * @param func_name 函数名称(字符串字面量) - * @param version SIMD版本枚举值 - * @param func 函数指针或可调用对象 - * - * 简化函数注册的便捷宏,自动推导函数签名并注册。 - * - * 示例: - * @code - * REGISTER_SIMD_FUNCTION("process", simd_func_version::SCALAR, scalar_process); - * REGISTER_SIMD_FUNCTION("process", simd_func_version::AVX2, avx2_process); - * @endcode - */ 
-#define REGISTER_SIMD_FUNCTION(func_name, version, func) \ - simd_func_dispatcher::instance().register_function(func_name, version, std::function(func)); - -/** - * @def GET_SIMD_FUNCTION - * @brief 获取SIMD函数宏 - 便捷的函数获取接口 - * @param func_signature 函数签名类型 - * @param func_name 函数名称 - * @return 多版本函数对象的引用 - * - * 示例: - * @code - * auto& process = GET_SIMD_FUNCTION(void(float*, size_t), "process"); - * process(data, count); - * @endcode - */ -#define GET_SIMD_FUNCTION(func_signature, func_name) \ - simd_func_dispatcher::instance().get_function(func_name); - -/** - * @def CALL_SIMD_FUNCTION - * @brief 调用SIMD函数宏 - 便捷的函数调用接口 - * @param func_signature 函数签名类型 - * @param func_name 函数名称 - * @param ... 函数参数 - * - * 示例: - * @code - * CALL_SIMD_FUNCTION(void(float*, size_t), "process", data, count); - * @endcode - */ -#define CALL_SIMD_FUNCTION(func_signature, func_name, ...) \ - simd_func_dispatcher::instance().call_function(func_name, __VA_ARGS__); - -/** - * @class simd_auto_register - * @brief SIMD自动注册助手 - 利用静态初始化自动注册函数 - * @tparam func_signature 函数签名类型 - * - * 该类利用C++的静态初始化机制,在程序启动时自动注册函数。 - * 通常不直接使用,而是通过AUTO_REGISTER_SIMD_FUNCTION宏。 - * - * @see AUTO_REGISTER_SIMD_FUNCTION - */ -template -class simd_auto_register { -public: - simd_auto_register(const std::string& func_name, simd_func_version version, std::function func) { - simd_func_dispatcher::instance().register_function(func_name, version, std::move(func)); - } -}; - -/** - * @brief 将SIMD函数版本枚举转换为字符串 - * @param version 函数版本枚举值 - * @return 本名称字符串 - * - * 用于调试输出和日志记录。 - * - * 示例: - * @code - * const char* name = simd_func_version_to_string(simd_func_version::AVX2); - * // name = "AVX2" - * @endcode - * - * @see simd_func_dispatcher.cpp - */ -const char* simd_func_version_to_string(simd_func_version version); - -/** - * @brief 将字符串转换为SIMD函数版本枚举 - * @param version_str 版本名称字符串 - * @return 对应的函数版本枚举值 - * - * 字符串不匹配时返回simd_func_version::SCALAR。 - * - * 示例: - * @code - * auto version = string_to_simd_func_version("AVX2"); - * 
// version = simd_func_version::AVX2 - * @endcode - * - * @see simd_func_dispatcher.cpp - */ -simd_func_version string_to_simd_func_version(const std::string& version_str); diff --git a/src/simd/simd_interface/CMakeLists.txt b/src/simd/simd_interface/CMakeLists.txt new file mode 100644 index 0000000..a9ff027 --- /dev/null +++ b/src/simd/simd_interface/CMakeLists.txt @@ -0,0 +1,4 @@ +project(alicho_simd_interface) + +simple_library(INTERFACE) +target_compile_definitions(${PROJECT_NAME} INTERFACE SIMD_INTERFACE_EXPORTS) \ No newline at end of file diff --git a/src/simd/aligned_allocator.h b/src/simd/simd_interface/aligned_allocator.h similarity index 98% rename from src/simd/aligned_allocator.h rename to src/simd/simd_interface/aligned_allocator.h index 80ae769..cc55a1c 100644 --- a/src/simd/aligned_allocator.h +++ b/src/simd/simd_interface/aligned_allocator.h @@ -29,6 +29,7 @@ #include #include #include // ASSERT_ALIGNED宏需要 +#include /** * @defgroup alignment_constants 对齐常量定义 @@ -385,29 +386,23 @@ bool operator!=(const aligned_allocator&, const aligned_allocator -using sse_aligned_allocator = aligned_allocator; +using sse_aligned_allocator = aligned_allocator; /** AVX对齐分配器 (32字节对齐) * 适用于AVX/AVX2指令优化的代码 - * @tparam T 元素类型 * * 示例: * @code * std::vector> data(1024); * @endcode */ -template -using avx_aligned_allocator = aligned_allocator; +using avx_aligned_allocator = aligned_allocator; /** AVX-512对齐分配器 (64字节对齐) * 适用于AVX-512指令优化的代码 - * @tparam T 元素类型 */ -template -using avx512_aligned_allocator = aligned_allocator; +using avx512_aligned_allocator = aligned_allocator; /** 缓存行对齐分配器 (64字节对齐) * 用于避免false sharing,优化多线程性能 diff --git a/src/simd/simd_interface/simd_export.h b/src/simd/simd_interface/simd_export.h new file mode 100644 index 0000000..10804fd --- /dev/null +++ b/src/simd/simd_interface/simd_export.h @@ -0,0 +1,17 @@ +#pragma once + +#if defined(_MSC_VER) + #ifdef SIMD_INTERFACE_EXPORTS + #define SIMD_EXPORT __declspec(dllexport) + #else + #define SIMD_EXPORT 
__declspec(dllimport) + #endif +#elif defined(__GNUC__) + #ifdef SIMD_INTERFACE_EXPORTS + #define SIMD_EXPORT __attribute__((visibility("default"))) + #else + #define SIMD_EXPORT + #endif +#else + #define SIMD_EXPORT +#endif \ No newline at end of file diff --git a/src/simd/simd_interface/simd_interface.cpp b/src/simd/simd_interface/simd_interface.cpp new file mode 100644 index 0000000..e0c2349 --- /dev/null +++ b/src/simd/simd_interface/simd_interface.cpp @@ -0,0 +1 @@ +#include "simd_interface.h" diff --git a/src/simd/simd_interface/simd_interface.h b/src/simd/simd_interface/simd_interface.h new file mode 100644 index 0000000..04b3d1f --- /dev/null +++ b/src/simd/simd_interface/simd_interface.h @@ -0,0 +1,17 @@ +#pragma once + +#include "simd_export.h" +#include + +extern "C" { +SIMD_EXPORT void fill_buffer(float* buffer, float value, size_t num_samples); +SIMD_EXPORT void mix_audio(const float* src1, const float* src2, float* dst, size_t num_samples); +SIMD_EXPORT void apply_gain(const float* src, float* dst, float gain, size_t num_samples); +SIMD_EXPORT float calculate_rms(const float* src, size_t num_samples); +SIMD_EXPORT float calculate_peak(const float* src, size_t num_samples); +SIMD_EXPORT void normalize_audio(const float* src, float* dst, float target_peak, size_t num_samples); +SIMD_EXPORT void stereo_to_mono(const float* stereo_src, float* mono_dst, size_t num_stereo_samples); +SIMD_EXPORT void limit_audio(const float* src, float* dst, float threshold, float* limiter_state, float sample_rate, size_t num_samples); +SIMD_EXPORT void fade_audio(const float* src, float* dst, size_t fade_in_samples, size_t fade_out_samples, size_t num_samples); +SIMD_EXPORT void simple_eq(const float* src, float* dst, float low_gain, float mid_gain, float high_gain, float* eq_state, size_t num_samples); +} \ No newline at end of file diff --git a/src/simd/simd_scaler/CMakeLists.txt b/src/simd/simd_scaler/CMakeLists.txt new file mode 100644 index 0000000..afc4f2b --- 
/dev/null +++ b/src/simd/simd_scaler/CMakeLists.txt @@ -0,0 +1,4 @@ +project(alicho_simd_scaler) + +simple_library(SHARED) +target_link_libraries(${PROJECT_NAME} PUBLIC alicho_simd_interface) diff --git a/src/simd/simd_scaler/simd_func.cpp b/src/simd/simd_scaler/simd_func.cpp new file mode 100644 index 0000000..662cc67 --- /dev/null +++ b/src/simd/simd_scaler/simd_func.cpp @@ -0,0 +1,179 @@ +#include "simd_interface.h" + +#include + +extern "C" +{ + SIMD_EXPORT void fill_buffer(float *buffer, float value, size_t num_samples) + { + for (size_t i = 0; i < num_samples; ++i) + { + buffer[i] = value; + } + } + + SIMD_EXPORT void mix_audio(const float *src1, const float *src2, float *dst, size_t num_samples) + { + for (size_t i = 0; i < num_samples; ++i) + { + dst[i] = src1[i] + src2[i]; + } + } + + SIMD_EXPORT void apply_gain(const float *src, float *dst, float gain, size_t num_samples) + { + for (size_t i = 0; i < num_samples; ++i) + { + dst[i] = src[i] * gain; + } + } + + SIMD_EXPORT float calculate_rms(const float *src, size_t num_samples) + { + float sum_squares = 0.0f; + for (size_t i = 0; i < num_samples; ++i) + { + sum_squares += src[i] * src[i]; + } + return std::sqrt(sum_squares / static_cast(num_samples)); + } + + SIMD_EXPORT float calculate_peak(const float *src, size_t num_samples) + { + float peak = 0.0f; + for (size_t i = 0; i < num_samples; ++i) + { + float abs_sample = std::fabs(src[i]); + if (abs_sample > peak) + { + peak = abs_sample; + } + } + return peak; + } + + SIMD_EXPORT void normalize_audio(const float *src, float *dst, float target_peak, size_t num_samples) + { + if (num_samples == 0 || target_peak <= 0.0f) + { + return; + } + + const float current_peak = calculate_peak(src, num_samples); + + if (current_peak < 1e-10f) + { + for (size_t i = 0; i < num_samples; ++i) + { + dst[i] = 0.0f; + } + return; + } + + const float gain_factor = target_peak / current_peak; + apply_gain(src, dst, gain_factor, num_samples); + } + + SIMD_EXPORT void 
stereo_to_mono(const float *stereo_src, float *mono_dst, size_t num_stereo_samples) + { + for (size_t i = 0; i < num_stereo_samples; i += 2) + { + mono_dst[i / 2] = (stereo_src[i] + stereo_src[i + 1]) * 0.5f; + } + } + + SIMD_EXPORT void limit_audio(const float *src, float *dst, float threshold, float *limiter_state, float sample_rate, + size_t num_samples) + { + if (num_samples == 0 || threshold <= 0.0f) + { + return; + } + constexpr float release_time = 0.05f; + float release_coeff = std::exp(-1.0f / (release_time * sample_rate)); + float current_gain = limiter_state != nullptr ? *limiter_state : 1.0f; + for (size_t i = 0; i < num_samples; ++i) + { + float sample = src[i]; + float abs_sample = std::fabs(sample); + + float target_gain = abs_sample > threshold ? threshold / abs_sample : 1.0f; + + if (target_gain < current_gain) + { + current_gain = target_gain; + } + else + { + current_gain = target_gain + (current_gain - target_gain) * release_coeff; + } + + dst[i] = sample * current_gain; + } + } + + SIMD_EXPORT void fade_audio(const float *src, float *dst, size_t fade_in_samples, size_t fade_out_samples, + size_t num_samples) + { + if (num_samples == 0) + { + return; + } + size_t i = 0; + if (fade_in_samples > 0) + { + const float fade_in_step = 1.0f / static_cast(fade_in_samples); + for (; i < std::min(fade_in_samples, num_samples); ++i) + { + const float gain = static_cast(i) * fade_in_step; + dst[i] = src[i] * gain; + } + } + if (fade_out_samples > 0) + { + const size_t fade_out_start = num_samples > fade_out_samples ? 
num_samples - fade_out_samples : 0; + const float fade_out_step = 1.0f / static_cast(fade_out_samples); + for (size_t j = fade_out_start; j < num_samples; ++j) + { + const size_t fade_out_offset = j - fade_out_start; + const float gain = 1.0f - static_cast(fade_out_offset) * fade_out_step; + dst[j] = src[j] * gain; + } + } + } + + SIMD_EXPORT void simple_eq(const float *src, float *dst, float low_gain, float mid_gain, float high_gain, float *eq_state, size_t num_samples) + { + if (num_samples == 0) + { + return; + } + + float low_pass_state = eq_state != nullptr ? eq_state[0] : 0.0f; + float high_pass_state = eq_state != nullptr ? eq_state[1] : 0.0f; + const float low_cutoff_freq = 200.0f; + const float high_cutoff_freq = 2000.0f; + const float sample_rate = 44100.0f; + const float low_alpha = low_cutoff_freq / (low_cutoff_freq + sample_rate); + const float high_alpha = sample_rate / (high_cutoff_freq + sample_rate); + for (size_t i = 0; i < num_samples; ++i) + { + float sample = src[i]; + + low_pass_state += low_alpha * (sample - low_pass_state); + float low_freq = low_pass_state; + + high_pass_state = high_alpha * (high_pass_state + sample - (i > 0 ? 
src[i - 1] : 0.0f)); + float high_freq = high_pass_state; + + float mid_freq = sample - low_freq - high_freq; + + dst[i] = low_freq * low_gain + mid_freq * mid_gain + high_freq * high_gain; + } + if (eq_state != nullptr) + { + eq_state[0] = low_pass_state; + eq_state[1] = high_pass_state; + } + } +} // extern "C" \ No newline at end of file diff --git a/src/simd/simd_sse/CMakeLists.txt b/src/simd/simd_sse/CMakeLists.txt new file mode 100644 index 0000000..19d843a --- /dev/null +++ b/src/simd/simd_sse/CMakeLists.txt @@ -0,0 +1,7 @@ +project(alicho_simd_sse) + +if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64") + simple_library(SHARED) + target_compile_options(${PROJECT_NAME} PRIVATE -msse4.2) + target_link_libraries(${PROJECT_NAME} PUBLIC alicho_simd_interface) +endif() diff --git a/src/simd/simd_sse/simd_func.cpp b/src/simd/simd_sse/simd_func.cpp new file mode 100644 index 0000000..bb41725 --- /dev/null +++ b/src/simd/simd_sse/simd_func.cpp @@ -0,0 +1,699 @@ +/** + * @file x86_sse_audio_processing_func.cpp + * @brief x86 SSE音频处理函数实现 + */ + +#include "simd_interface.h" + +#include +#include +#include "aligned_allocator.h" + +extern "C" +{ + SIMD_EXPORT void fill_buffer(float *buffer, float value, size_t num_samples) + { + ASSERT_ALIGNED(buffer, ALIGNMENT_SSE); + + constexpr size_t simd_width = 4; + constexpr size_t unroll_factor = 4; + size_t i = 0; + + auto value_vec = _mm_set1_ps(value); + + for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) + { + _mm_store_ps(&buffer[i], value_vec); + _mm_store_ps(&buffer[i + 4], value_vec); + _mm_store_ps(&buffer[i + 8], value_vec); + _mm_store_ps(&buffer[i + 12], value_vec); + } + + for (; i + simd_width <= num_samples; i += simd_width) + { + _mm_store_ps(&buffer[i], value_vec); + } + + for (; i < num_samples; ++i) + { + buffer[i] = value; + } + } + + SIMD_EXPORT void mix_audio(const float *src1, const float *src2, float *dst, size_t num_samples) + { + ASSERT_ALIGNED(src1, ALIGNMENT_SSE); + 
ASSERT_ALIGNED(src2, ALIGNMENT_SSE); + ASSERT_ALIGNED(dst, ALIGNMENT_SSE); + + constexpr size_t simd_width = 4; + constexpr size_t unroll_factor = 4; + size_t i = 0; + + for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) + { + auto a0 = _mm_load_ps(&src1[i]); + auto a1 = _mm_load_ps(&src1[i + 4]); + auto a2 = _mm_load_ps(&src1[i + 8]); + auto a3 = _mm_load_ps(&src1[i + 12]); + + auto b0 = _mm_load_ps(&src2[i]); + auto b1 = _mm_load_ps(&src2[i + 4]); + auto b2 = _mm_load_ps(&src2[i + 8]); + auto b3 = _mm_load_ps(&src2[i + 12]); + + auto result0 = _mm_add_ps(a0, b0); + auto result1 = _mm_add_ps(a1, b1); + auto result2 = _mm_add_ps(a2, b2); + auto result3 = _mm_add_ps(a3, b3); + + _mm_store_ps(&dst[i], result0); + _mm_store_ps(&dst[i + 4], result1); + _mm_store_ps(&dst[i + 8], result2); + _mm_store_ps(&dst[i + 12], result3); + } + + for (; i + simd_width <= num_samples; i += simd_width) + { + auto a = _mm_load_ps(&src1[i]); + auto b = _mm_load_ps(&src2[i]); + auto result = _mm_add_ps(a, b); + _mm_store_ps(&dst[i], result); + } + + for (; i < num_samples; ++i) + { + dst[i] = src1[i] + src2[i]; + } + } + + SIMD_EXPORT void apply_gain(const float *src, float *dst, float gain, size_t num_samples) + { + ASSERT_ALIGNED(src, ALIGNMENT_SSE); + ASSERT_ALIGNED(dst, ALIGNMENT_SSE); + + constexpr size_t simd_width = 4; + constexpr size_t unroll_factor = 4; + size_t i = 0; + + auto gain_vec = _mm_set1_ps(gain); + + for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) + { + auto a0 = _mm_load_ps(&src[i]); + auto a1 = _mm_load_ps(&src[i + 4]); + auto a2 = _mm_load_ps(&src[i + 8]); + auto a3 = _mm_load_ps(&src[i + 12]); + + auto result0 = _mm_mul_ps(a0, gain_vec); + auto result1 = _mm_mul_ps(a1, gain_vec); + auto result2 = _mm_mul_ps(a2, gain_vec); + auto result3 = _mm_mul_ps(a3, gain_vec); + + _mm_store_ps(&dst[i], result0); + _mm_store_ps(&dst[i + 4], result1); + _mm_store_ps(&dst[i + 8], result2); + 
_mm_store_ps(&dst[i + 12], result3); + } + + for (; i + simd_width <= num_samples; i += simd_width) + { + auto a = _mm_load_ps(&src[i]); + auto result = _mm_mul_ps(a, gain_vec); + _mm_store_ps(&dst[i], result); + } + + for (; i < num_samples; ++i) + { + dst[i] = src[i] * gain; + } + } + + SIMD_EXPORT float calculate_rms(const float *src, size_t num_samples) + { + ASSERT_ALIGNED(src, ALIGNMENT_SSE); + + constexpr size_t simd_width = 4; + constexpr size_t unroll_factor = 4; + size_t i = 0; + auto sum_squares0 = _mm_setzero_ps(); + auto sum_squares1 = _mm_setzero_ps(); + auto sum_squares2 = _mm_setzero_ps(); + auto sum_squares3 = _mm_setzero_ps(); + + for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) + { + const auto a0 = _mm_load_ps(&src[i]); + const auto a1 = _mm_load_ps(&src[i + 4]); + const auto a2 = _mm_load_ps(&src[i + 8]); + const auto a3 = _mm_load_ps(&src[i + 12]); + + const auto squared0 = _mm_mul_ps(a0, a0); + const auto squared1 = _mm_mul_ps(a1, a1); + const auto squared2 = _mm_mul_ps(a2, a2); + const auto squared3 = _mm_mul_ps(a3, a3); + + sum_squares0 = _mm_add_ps(sum_squares0, squared0); + sum_squares1 = _mm_add_ps(sum_squares1, squared1); + sum_squares2 = _mm_add_ps(sum_squares2, squared2); + sum_squares3 = _mm_add_ps(sum_squares3, squared3); + } + + auto sum_squares = _mm_add_ps(_mm_add_ps(sum_squares0, sum_squares1), + _mm_add_ps(sum_squares2, sum_squares3)); + + for (; i + simd_width <= num_samples; i += simd_width) + { + const auto a = _mm_load_ps(&src[i]); + const auto squared = _mm_mul_ps(a, a); + sum_squares = _mm_add_ps(sum_squares, squared); + } + + auto hadd1 = _mm_hadd_ps(sum_squares, sum_squares); + auto hadd2 = _mm_hadd_ps(hadd1, hadd1); + double total_sum = _mm_cvtss_f32(hadd2); + + for (; i < num_samples; ++i) + { + total_sum += static_cast(src[i]) * static_cast(src[i]); + } + + return static_cast(std::sqrt(total_sum / static_cast(num_samples))); + } + + SIMD_EXPORT float calculate_peak(const float 
*src, size_t num_samples) + { + ASSERT_ALIGNED(src, ALIGNMENT_SSE); + + constexpr size_t simd_width = 4; + constexpr size_t unroll_factor = 4; + size_t i = 0; + auto peak_vec0 = _mm_setzero_ps(); + auto peak_vec1 = _mm_setzero_ps(); + auto peak_vec2 = _mm_setzero_ps(); + auto peak_vec3 = _mm_setzero_ps(); + + for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) + { + const auto a0 = _mm_load_ps(&src[i]); + const auto a1 = _mm_load_ps(&src[i + 4]); + const auto a2 = _mm_load_ps(&src[i + 8]); + const auto a3 = _mm_load_ps(&src[i + 12]); + + const auto abs_a0 = _mm_andnot_ps(_mm_set1_ps(-0.0f), a0); + const auto abs_a1 = _mm_andnot_ps(_mm_set1_ps(-0.0f), a1); + const auto abs_a2 = _mm_andnot_ps(_mm_set1_ps(-0.0f), a2); + const auto abs_a3 = _mm_andnot_ps(_mm_set1_ps(-0.0f), a3); + + peak_vec0 = _mm_max_ps(peak_vec0, abs_a0); + peak_vec1 = _mm_max_ps(peak_vec1, abs_a1); + peak_vec2 = _mm_max_ps(peak_vec2, abs_a2); + peak_vec3 = _mm_max_ps(peak_vec3, abs_a3); + } + + auto peak_vec = _mm_max_ps(_mm_max_ps(peak_vec0, peak_vec1), + _mm_max_ps(peak_vec2, peak_vec3)); + + for (; i + simd_width <= num_samples; i += simd_width) + { + const auto a = _mm_load_ps(&src[i]); + const auto abs_a = _mm_andnot_ps(_mm_set1_ps(-0.0f), a); + peak_vec = _mm_max_ps(peak_vec, abs_a); + } + + auto temp1 = _mm_shuffle_ps(peak_vec, peak_vec, _MM_SHUFFLE(2, 3, 0, 1)); + auto max1 = _mm_max_ps(peak_vec, temp1); + auto temp2 = _mm_shuffle_ps(max1, max1, _MM_SHUFFLE(1, 0, 3, 2)); + auto final_max = _mm_max_ps(max1, temp2); + float peak = _mm_cvtss_f32(final_max); + + for (; i < num_samples; ++i) + { + float abs_sample = std::fabs(src[i]); + if (abs_sample > peak) + { + peak = abs_sample; + } + } + + return peak; + } + + SIMD_EXPORT void normalize_audio(const float *src, float *dst, float target_peak, size_t num_samples) + { + ASSERT_ALIGNED(src, ALIGNMENT_SSE); + ASSERT_ALIGNED(dst, ALIGNMENT_SSE); + + if (num_samples == 0 || target_peak <= 0.0f) + { + return; + } 
+ + const float current_peak = calculate_peak(src, num_samples); + + if (current_peak < 1e-10f) + { + constexpr size_t simd_width = 4; + constexpr size_t unroll_factor = 4; + auto zero_vec = _mm_setzero_ps(); + size_t i = 0; + + for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) + { + _mm_store_ps(&dst[i], zero_vec); + _mm_store_ps(&dst[i + 4], zero_vec); + _mm_store_ps(&dst[i + 8], zero_vec); + _mm_store_ps(&dst[i + 12], zero_vec); + } + + for (; i + simd_width <= num_samples; i += simd_width) + { + _mm_store_ps(&dst[i], zero_vec); + } + + for (; i < num_samples; ++i) + { + dst[i] = 0.0f; + } + return; + } + + const float gain_factor = target_peak / current_peak; + apply_gain(src, dst, gain_factor, num_samples); + } + + SIMD_EXPORT void stereo_to_mono(const float *stereo_src, float *mono_dst, size_t num_stereo_samples) + { + ASSERT_ALIGNED(stereo_src, ALIGNMENT_SSE); + ASSERT_ALIGNED(mono_dst, ALIGNMENT_SSE); + + if (num_stereo_samples == 0) + { + return; + } + + constexpr size_t simd_width = 4; + constexpr size_t unroll_factor = 4; + const auto half_vec = _mm_set1_ps(0.5f); + size_t stereo_idx = 0; + size_t mono_idx = 0; + + for (; stereo_idx + simd_width * 2 * unroll_factor <= num_stereo_samples * 2; + stereo_idx += simd_width * 2 * unroll_factor, mono_idx += simd_width * unroll_factor) + { + auto stereo0 = _mm_load_ps(&stereo_src[stereo_idx]); + auto stereo1 = _mm_load_ps(&stereo_src[stereo_idx + 4]); + auto stereo2 = _mm_load_ps(&stereo_src[stereo_idx + 8]); + auto stereo3 = _mm_load_ps(&stereo_src[stereo_idx + 12]); + auto stereo4 = _mm_load_ps(&stereo_src[stereo_idx + 16]); + auto stereo5 = _mm_load_ps(&stereo_src[stereo_idx + 20]); + auto stereo6 = _mm_load_ps(&stereo_src[stereo_idx + 24]); + auto stereo7 = _mm_load_ps(&stereo_src[stereo_idx + 28]); + + auto left0 = _mm_shuffle_ps(stereo0, stereo1, _MM_SHUFFLE(2, 0, 2, 0)); + auto right0 = _mm_shuffle_ps(stereo0, stereo1, _MM_SHUFFLE(3, 1, 3, 1)); + auto left1 = 
_mm_shuffle_ps(stereo2, stereo3, _MM_SHUFFLE(2, 0, 2, 0)); + auto right1 = _mm_shuffle_ps(stereo2, stereo3, _MM_SHUFFLE(3, 1, 3, 1)); + auto left2 = _mm_shuffle_ps(stereo4, stereo5, _MM_SHUFFLE(2, 0, 2, 0)); + auto right2 = _mm_shuffle_ps(stereo4, stereo5, _MM_SHUFFLE(3, 1, 3, 1)); + auto left3 = _mm_shuffle_ps(stereo6, stereo7, _MM_SHUFFLE(2, 0, 2, 0)); + auto right3 = _mm_shuffle_ps(stereo6, stereo7, _MM_SHUFFLE(3, 1, 3, 1)); + + auto mono0 = _mm_mul_ps(_mm_add_ps(left0, right0), half_vec); + auto mono1 = _mm_mul_ps(_mm_add_ps(left1, right1), half_vec); + auto mono2 = _mm_mul_ps(_mm_add_ps(left2, right2), half_vec); + auto mono3 = _mm_mul_ps(_mm_add_ps(left3, right3), half_vec); + + _mm_store_ps(&mono_dst[mono_idx], mono0); + _mm_store_ps(&mono_dst[mono_idx + 4], mono1); + _mm_store_ps(&mono_dst[mono_idx + 8], mono2); + _mm_store_ps(&mono_dst[mono_idx + 12], mono3); + } + + for (size_t i = stereo_idx / 2; i < num_stereo_samples; ++i) + { + const float left = stereo_src[i * 2]; + const float right = stereo_src[i * 2 + 1]; + mono_dst[i] = (left + right) * 0.5f; + } + } + + SIMD_EXPORT void limit_audio(const float *src, float *dst, float threshold, float *limiter_state, float sample_rate, + size_t num_samples) + { + ASSERT_ALIGNED(src, ALIGNMENT_SSE); + ASSERT_ALIGNED(dst, ALIGNMENT_SSE); + + if (num_samples == 0 || threshold <= 0.0f) + { + return; + } + + constexpr size_t simd_width = 4; + constexpr size_t unroll_factor = 4; + constexpr float release_time = 0.05f; + float release_coeff = std::exp(-1.0f / (release_time * sample_rate)); + + float current_gain = limiter_state != nullptr ? 
*limiter_state : 1.0f; + + size_t i = 0; + + for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) + { + auto a0 = _mm_load_ps(&src[i]); + auto a1 = _mm_load_ps(&src[i + 4]); + auto a2 = _mm_load_ps(&src[i + 8]); + auto a3 = _mm_load_ps(&src[i + 12]); + + auto abs_a0 = _mm_andnot_ps(_mm_set1_ps(-0.0f), a0); + auto abs_a1 = _mm_andnot_ps(_mm_set1_ps(-0.0f), a1); + auto abs_a2 = _mm_andnot_ps(_mm_set1_ps(-0.0f), a2); + auto abs_a3 = _mm_andnot_ps(_mm_set1_ps(-0.0f), a3); + + auto max_abs = _mm_max_ps(_mm_max_ps(abs_a0, abs_a1), + _mm_max_ps(abs_a2, abs_a3)); + + auto temp1 = _mm_shuffle_ps(max_abs, max_abs, _MM_SHUFFLE(2, 3, 0, 1)); + auto max1 = _mm_max_ps(max_abs, temp1); + auto temp2 = _mm_shuffle_ps(max1, max1, _MM_SHUFFLE(1, 0, 3, 2)); + auto final_max = _mm_max_ps(max1, temp2); + + float max_sample = _mm_cvtss_f32(final_max); + + float target_gain = max_sample > threshold ? threshold / max_sample : 1.0f; + + if (target_gain < current_gain) + { + current_gain = target_gain; + } + else + { + current_gain = target_gain + (current_gain - target_gain) * release_coeff; + } + + auto gain_vec = _mm_set1_ps(current_gain); + + auto result0 = _mm_mul_ps(a0, gain_vec); + auto result1 = _mm_mul_ps(a1, gain_vec); + auto result2 = _mm_mul_ps(a2, gain_vec); + auto result3 = _mm_mul_ps(a3, gain_vec); + + _mm_store_ps(&dst[i], result0); + _mm_store_ps(&dst[i + 4], result1); + _mm_store_ps(&dst[i + 8], result2); + _mm_store_ps(&dst[i + 12], result3); + } + + for (; i + simd_width <= num_samples; i += simd_width) + { + auto a = _mm_load_ps(&src[i]); + auto abs_a = _mm_andnot_ps(_mm_set1_ps(-0.0f), a); + + auto max_abs = abs_a; + + auto temp1 = _mm_shuffle_ps(max_abs, max_abs, _MM_SHUFFLE(2, 3, 0, 1)); + auto max1 = _mm_max_ps(max_abs, temp1); + auto temp2 = _mm_shuffle_ps(max1, max1, _MM_SHUFFLE(1, 0, 3, 2)); + auto final_max = _mm_max_ps(max1, temp2); + + float max_sample = _mm_cvtss_f32(final_max); + + float target_gain = max_sample > threshold ? 
threshold / max_sample : 1.0f; + + if (target_gain < current_gain) + { + current_gain = target_gain; + } + else + { + current_gain = target_gain + (current_gain - target_gain) * release_coeff; + } + + auto gain_vec = _mm_set1_ps(current_gain); + auto result = _mm_mul_ps(a, gain_vec); + _mm_store_ps(&dst[i], result); + } + + for (; i < num_samples; ++i) + { + float sample = src[i]; + float abs_sample = std::fabs(sample); + + float target_gain = abs_sample > threshold ? threshold / abs_sample : 1.0f; + + if (target_gain < current_gain) + { + current_gain = target_gain; + } + else + { + current_gain = target_gain + (current_gain - target_gain) * release_coeff; + } + + dst[i] = sample * current_gain; + } + + if (limiter_state != nullptr) + { + *limiter_state = current_gain; + } + } + + SIMD_EXPORT void fade_audio(const float *src, float *dst, size_t fade_in_samples, size_t fade_out_samples, + size_t num_samples) + { + ASSERT_ALIGNED(src, ALIGNMENT_SSE); + ASSERT_ALIGNED(dst, ALIGNMENT_SSE); + + if (num_samples == 0) + { + return; + } + + constexpr size_t simd_width = 4; + constexpr size_t unroll_factor = 4; + size_t i = 0; + + if (fade_in_samples > 0) + { + const float fade_in_step = 1.0f / static_cast(fade_in_samples); + + for (; i + simd_width * unroll_factor <= std::min(fade_in_samples, num_samples); i += simd_width * + unroll_factor) + { + auto gain0 = _mm_set_ps((i + 3) * fade_in_step, (i + 2) * fade_in_step, (i + 1) * fade_in_step, + i * fade_in_step); + auto gain1 = _mm_set_ps((i + 7) * fade_in_step, (i + 6) * fade_in_step, (i + 5) * fade_in_step, + (i + 4) * fade_in_step); + auto gain2 = _mm_set_ps((i + 11) * fade_in_step, (i + 10) * fade_in_step, (i + 9) * fade_in_step, + (i + 8) * fade_in_step); + auto gain3 = _mm_set_ps((i + 15) * fade_in_step, (i + 14) * fade_in_step, (i + 13) * fade_in_step, + (i + 12) * fade_in_step); + + auto a0 = _mm_load_ps(&src[i]); + auto a1 = _mm_load_ps(&src[i + 4]); + auto a2 = _mm_load_ps(&src[i + 8]); + auto a3 = 
_mm_load_ps(&src[i + 12]); + + auto result0 = _mm_mul_ps(a0, gain0); + auto result1 = _mm_mul_ps(a1, gain1); + auto result2 = _mm_mul_ps(a2, gain2); + auto result3 = _mm_mul_ps(a3, gain3); + + _mm_store_ps(&dst[i], result0); + _mm_store_ps(&dst[i + 4], result1); + _mm_store_ps(&dst[i + 8], result2); + _mm_store_ps(&dst[i + 12], result3); + } + + for (; i < std::min(fade_in_samples, num_samples); ++i) + { + const float gain = static_cast(i) / static_cast(fade_in_samples); + dst[i] = src[i] * gain; + } + } + + const size_t middle_start = fade_in_samples; + const size_t middle_end = num_samples > fade_out_samples ? num_samples - fade_out_samples : 0; + + if (middle_end > middle_start) + { + for (size_t j = middle_start; j + simd_width * unroll_factor <= middle_end; j += simd_width * + unroll_factor) + { + auto a0 = _mm_load_ps(&src[j]); + auto a1 = _mm_load_ps(&src[j + 4]); + auto a2 = _mm_load_ps(&src[j + 8]); + auto a3 = _mm_load_ps(&src[j + 12]); + + _mm_store_ps(&dst[j], a0); + _mm_store_ps(&dst[j + 4], a1); + _mm_store_ps(&dst[j + 8], a2); + _mm_store_ps(&dst[j + 12], a3); + } + + for (size_t j = middle_start + ((middle_end - middle_start) / (simd_width * unroll_factor)) * (simd_width * + unroll_factor); + j < middle_end; ++j) + { + dst[j] = src[j]; + } + } + + if (fade_out_samples > 0 && num_samples > fade_out_samples) + { + const size_t fade_out_start = num_samples - fade_out_samples; + const float fade_out_step = 1.0f / static_cast(fade_out_samples); + + for (size_t j = fade_out_start; j + simd_width * unroll_factor <= num_samples; j += simd_width * + unroll_factor) + { + const size_t fade_out_offset = j - fade_out_start; + auto gain0 = _mm_set_ps(1.0f - (fade_out_offset + 3) * fade_out_step, + 1.0f - (fade_out_offset + 2) * fade_out_step, + 1.0f - (fade_out_offset + 1) * fade_out_step, + 1.0f - fade_out_offset * fade_out_step); + auto gain1 = _mm_set_ps(1.0f - (fade_out_offset + 7) * fade_out_step, + 1.0f - (fade_out_offset + 6) * fade_out_step, + 1.0f - 
(fade_out_offset + 5) * fade_out_step, + 1.0f - (fade_out_offset + 4) * fade_out_step); + auto gain2 = _mm_set_ps(1.0f - (fade_out_offset + 11) * fade_out_step, + 1.0f - (fade_out_offset + 10) * fade_out_step, + 1.0f - (fade_out_offset + 9) * fade_out_step, + 1.0f - (fade_out_offset + 8) * fade_out_step); + auto gain3 = _mm_set_ps(1.0f - (fade_out_offset + 15) * fade_out_step, + 1.0f - (fade_out_offset + 14) * fade_out_step, + 1.0f - (fade_out_offset + 13) * fade_out_step, + 1.0f - (fade_out_offset + 12) * fade_out_step); + + auto a0 = _mm_load_ps(&src[j]); + auto a1 = _mm_load_ps(&src[j + 4]); + auto a2 = _mm_load_ps(&src[j + 8]); + auto a3 = _mm_load_ps(&src[j + 12]); + + auto result0 = _mm_mul_ps(a0, gain0); + auto result1 = _mm_mul_ps(a1, gain1); + auto result2 = _mm_mul_ps(a2, gain2); + auto result3 = _mm_mul_ps(a3, gain3); + + _mm_store_ps(&dst[j], result0); + _mm_store_ps(&dst[j + 4], result1); + _mm_store_ps(&dst[j + 8], result2); + _mm_store_ps(&dst[j + 12], result3); + } + + for (size_t j = fade_out_start + ((fade_out_samples / (simd_width * unroll_factor)) * (simd_width * + unroll_factor)); + j < num_samples; ++j) + { + const size_t fade_out_offset = j - fade_out_start; + const float gain = 1.0f - static_cast(fade_out_offset) / static_cast(fade_out_samples); + dst[j] = src[j] * gain; + } + } + } + + SIMD_EXPORT void simple_eq(const float *src, float *dst, float low_gain, float mid_gain, float high_gain, float *eq_state, + size_t num_samples) + { + ASSERT_ALIGNED(src, ALIGNMENT_SSE); + ASSERT_ALIGNED(dst, ALIGNMENT_SSE); + + if (num_samples == 0) + { + return; + } + + constexpr size_t simd_width = 4; + constexpr size_t unroll_factor = 4; + + constexpr float low_cutoff = 0.02f; + constexpr float high_cutoff = 0.1f; + constexpr float mid_factor = 0.7f; + + float low_state = eq_state != nullptr ? *eq_state : 0.0f; + float high_state = eq_state != nullptr ? 
*(eq_state + 1) : 0.0f; + + const auto low_gain_vec = _mm_set1_ps(low_gain); + const auto mid_gain_vec = _mm_set1_ps(mid_gain); + const auto high_gain_vec = _mm_set1_ps(high_gain); + const auto low_cutoff_vec = _mm_set1_ps(low_cutoff); + const auto high_cutoff_vec = _mm_set1_ps(high_cutoff); + const auto mid_factor_vec = _mm_set1_ps(mid_factor); + const auto one_minus_low_cutoff_vec = _mm_set1_ps(1.0f - low_cutoff); + const auto one_minus_high_cutoff_vec = _mm_set1_ps(1.0f - high_cutoff); + + size_t i = 0; + + for (; i + simd_width * unroll_factor <= num_samples; i += simd_width * unroll_factor) + { + auto input0 = _mm_load_ps(&src[i]); + auto input1 = _mm_load_ps(&src[i + 4]); + auto input2 = _mm_load_ps(&src[i + 8]); + auto input3 = _mm_load_ps(&src[i + 12]); + + auto low_state_vec = _mm_set1_ps(low_state); + auto low0 = _mm_add_ps(_mm_mul_ps(input0, low_cutoff_vec), + _mm_mul_ps(low_state_vec, one_minus_low_cutoff_vec)); + auto low1 = _mm_add_ps(_mm_mul_ps(input1, low_cutoff_vec), _mm_mul_ps(low0, one_minus_low_cutoff_vec)); + auto low2 = _mm_add_ps(_mm_mul_ps(input2, low_cutoff_vec), _mm_mul_ps(low1, one_minus_low_cutoff_vec)); + auto low3 = _mm_add_ps(_mm_mul_ps(input3, low_cutoff_vec), _mm_mul_ps(low2, one_minus_low_cutoff_vec)); + + auto high0 = _mm_sub_ps(input0, low0); + auto high1 = _mm_sub_ps(input1, low1); + auto high2 = _mm_sub_ps(input2, low2); + auto high3 = _mm_sub_ps(input3, low3); + + auto high_state_vec = _mm_set1_ps(high_state); + high0 = _mm_add_ps(_mm_mul_ps(high0, high_cutoff_vec), + _mm_mul_ps(high_state_vec, one_minus_high_cutoff_vec)); + high1 = _mm_add_ps(_mm_mul_ps(high1, high_cutoff_vec), _mm_mul_ps(high0, one_minus_high_cutoff_vec)); + high2 = _mm_add_ps(_mm_mul_ps(high2, high_cutoff_vec), _mm_mul_ps(high1, one_minus_high_cutoff_vec)); + high3 = _mm_add_ps(_mm_mul_ps(high3, high_cutoff_vec), _mm_mul_ps(high2, one_minus_high_cutoff_vec)); + + auto mid0 = _mm_mul_ps(_mm_sub_ps(_mm_sub_ps(input0, low0), high0), mid_factor_vec); + auto 
mid1 = _mm_mul_ps(_mm_sub_ps(_mm_sub_ps(input1, low1), high1), mid_factor_vec); + auto mid2 = _mm_mul_ps(_mm_sub_ps(_mm_sub_ps(input2, low2), high2), mid_factor_vec); + auto mid3 = _mm_mul_ps(_mm_sub_ps(_mm_sub_ps(input3, low3), high3), mid_factor_vec); + + auto result0 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(low0, low_gain_vec), _mm_mul_ps(mid0, mid_gain_vec)), + _mm_mul_ps(high0, high_gain_vec)); + auto result1 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(low1, low_gain_vec), _mm_mul_ps(mid1, mid_gain_vec)), + _mm_mul_ps(high1, high_gain_vec)); + auto result2 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(low2, low_gain_vec), _mm_mul_ps(mid2, mid_gain_vec)), + _mm_mul_ps(high2, high_gain_vec)); + auto result3 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(low3, low_gain_vec), _mm_mul_ps(mid3, mid_gain_vec)), + _mm_mul_ps(high3, high_gain_vec)); + + _mm_store_ps(&dst[i], result0); + _mm_store_ps(&dst[i + 4], result1); + _mm_store_ps(&dst[i + 8], result2); + _mm_store_ps(&dst[i + 12], result3); + + low_state = _mm_cvtss_f32(_mm_shuffle_ps(low3, low3, _MM_SHUFFLE(3, 3, 3, 3))); + high_state = _mm_cvtss_f32(_mm_shuffle_ps(high3, high3, _MM_SHUFFLE(3, 3, 3, 3))); + } + + for (; i < num_samples; ++i) + { + float input = src[i]; + + float low_output = low_cutoff * input + (1.0f - low_cutoff) * low_state; + low_state = low_output; + + float high_input = input - low_output; + float high_output = high_cutoff * high_input + (1.0f - high_cutoff) * high_state; + high_state = high_output; + + float mid_output = (input - low_output - high_output) * mid_factor; + + dst[i] = low_output * low_gain + mid_output * mid_gain + high_output * high_gain; + } + + if (eq_state != nullptr) + { + *eq_state = low_state; + *(eq_state + 1) = high_state; + } + } +} \ No newline at end of file diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e756c4e..003c0a3 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -10,6 +10,7 @@ enable_testing() include(cmake/test_helpers.cmake) # 添加各测试模块 
+add_subdirectory(helpers) add_subdirectory(simd) add_subdirectory(network) add_subdirectory(shm) diff --git a/tests/helpers/CMakeLists.txt b/tests/helpers/CMakeLists.txt new file mode 100644 index 0000000..abf6d47 --- /dev/null +++ b/tests/helpers/CMakeLists.txt @@ -0,0 +1,4 @@ +project(alicho_test_helper) + +simple_library(STATIC) +target_link_libraries(${PROJECT_NAME} PUBLIC GTest::gtest GTest::gtest_main audio_backend_project_options) diff --git a/tests/simd/CMakeLists.txt b/tests/simd/CMakeLists.txt index 27f951a..36b6f82 100644 --- a/tests/simd/CMakeLists.txt +++ b/tests/simd/CMakeLists.txt @@ -7,7 +7,7 @@ add_module_test( TARGET test_simd_basic SOURCE_FILE test_simd_basic.cpp MODULE "SIMD" - LINK_LIBRARIES alicho_simd + LINK_LIBRARIES alicho_simd alicho_simd_interface alicho_test_helper ) # SIMD 音频处理测试 @@ -15,7 +15,7 @@ add_module_test( TARGET test_simd_audio_processing SOURCE_FILE test_simd_audio_processing.cpp MODULE "SIMD" - LINK_LIBRARIES alicho_simd + LINK_LIBRARIES alicho_simd alicho_simd_interface alicho_test_helper ) # 自定义目标:运行 SIMD 测试 diff --git a/tests/simd/test_simd_audio_processing.cpp b/tests/simd/test_simd_audio_processing.cpp index d5b91d1..8da388b 100644 --- a/tests/simd/test_simd_audio_processing.cpp +++ b/tests/simd/test_simd_audio_processing.cpp @@ -1,799 +1,727 @@ -/** - * @file test_audio_processing_comprehensive.cpp - * @brief 音频处理函数综合测试套件 - * - * 测试覆盖: - * - 9个音频处理函数确性测试 - * - 标量与SIMD版本的一致性测试 - * - 边界条件和错误处理测试 - * - 性能对比测试 - * - 跨平台兼容性测试 - */ #include #include #include -#include -#include -#include +#include #include #include -#include +#include +#include -// 确保M_PI定义可用 -#ifndef M_PI -#define M_PI 3.14159265358979323846 -#endif - -// 音频处理函数头文件 +// 修正include路径 +#include "simd_api.h" #include "aligned_allocator.h" -#include "simd_func_dispatcher.h" -#include "audio_processing/scalar_audio_processing_func.h" -#include "audio_processing/simd_audio_processing.h" +#include "performance_timer.h" -#if ALICHO_PLATFORM_X86 -#include 
"audio_processing/x86_simd_audio_processing_func.h" -#endif +using namespace shm_test; -#if ALICHO_PLATFORM_ARM -#include "audio_processing/arm_simd_audio_processing_func.h" -#endif +namespace { -// 测试容差设置 -constexpr float FLOAT_TOLERANCE = 1e-6f; -constexpr float RMS_TOLERANCE = 1e-5f; -constexpr float PEAK_TOLERANCE = 1e-6f; - -// 性能测试设置 -constexpr size_t PERF_TEST_SIZE = 1024 * 1024; // 1M samples -constexpr int PERF_TEST_ITERATIONS = 100; - -using aligned_audio_buffer = std::vector>; +// ============================================================================ +// 测试数据生成辅助函数 +// ============================================================================ /** - * 浮点数比较函数 + * @brief 生成正弦波测试数据 + * @param frequency 频率(0-1,相对于采样率) + * @param num_samples 样本数量 + * @return 正弦波数据向量 */ -bool float_equal(float a, float b, float tolerance = FLOAT_TOLERANCE) { - if (std::isnan(a) && std::isnan(b)) - return true; - if (std::isinf(a) && std::isinf(b)) - return (a > 0) == (b > 0); - return std::abs(a - b) <= tolerance; +template> +std::vector generate_sine_wave(float frequency, size_t num_samples) { + std::vector data(num_samples); + for (size_t i = 0; i < num_samples; ++i) { + data[i] = std::sin(2.0f * 3.14159265f * frequency * i); + } + return data; } /** - * 数组比较函数 + * @brief 生成常数值测试数据 + * @param value 常数值 + * @param num_samples 样本数量 + * @return 常数值数据向量 */ -bool arrays_equal(const float* arr1, const float* arr2, size_t size, float tolerance = FLOAT_TOLERANCE) { - for (size_t i = 0; i < size; ++i) { - if (!float_equal(arr1[i], arr2[i], tolerance)) { - std::cout << " 差异在位置 " << i << ": " << arr1[i] << " vs " << arr2[i] - << " (差值: " << std::abs(arr1[i] - arr2[i]) << ")" << std::endl; - return false; - } - } - return true; +template> +std::vector generate_constant_data(float value, size_t num_samples) { + return std::vector(num_samples, value); } /** - * 测试数据生成器 + * @brief 检查两个浮点数是否接近 + * @param a 第一个值 + * @param b 第二个值 + * @param tolerance 容差 + * @return 是否接近 */ -class 
AudioDataGenerator { -private: - mutable std::mt19937 rng_{std::random_device{}()}; +bool float_near(float a, float b, float tolerance = 1e-5f) { + if (std::isnan(a) || std::isnan(b)) return false; + if (std::isinf(a) || std::isinf(b)) return a == b; + return std::abs(a - b) < tolerance; +} -public: - // 生成正弦波 - auto generate_sine_wave(size_t num_samples, float frequency = 440.0f, - float sample_rate = 44100.0f, float amplitude = 1.0f) const { - std::vector> data(num_samples); - for (size_t i = 0; i < num_samples; ++i) { - data[i] = amplitude * std::sin(2.0f * M_PI * frequency * i / sample_rate); - } - return data; - } +/** + * @brief 检查两个缓冲区是否相等(允许浮点误差) + * @param expected 期望值 + * @param actual 实际值 + * @param tolerance 容差 + * @return 是否相等 + */ +template +bool buffers_equal(const std::vector& expected, + const std::vector& actual, + float tolerance = 1e-5f) { + if (expected.size() != actual.size()) { + return false; + } + for (size_t i = 0; i < expected.size(); ++i) { + if (!float_near(expected[i], actual[i], tolerance)) { + return false; + } + } + return true; +} - // 生成白噪声 - auto generate_white_noise(size_t num_samples, float amplitude = 1.0f) const { - std::vector> data(num_samples); - std::uniform_real_distribution dist(-amplitude, amplitude); - for (size_t i = 0; i < num_samples; ++i) { - data[i] = dist(rng_); - } - return data; - } +// ============================================================================ +// 功能单元测试 - 参数化测试 +// ============================================================================ - // 生成脉冲信号 - auto generate_impulse(size_t num_samples, size_t impulse_pos = 0, float amplitude = 1.0f) const { - std::vector> data(num_samples, 0.0f); - if (impulse_pos < num_samples) { - data[impulse_pos] = amplitude; - } - return data; - } - - // 生成直流信号 - aligned_audio_buffer generate_dc(size_t num_samples, float value = 1.0f) const { - return aligned_audio_buffer(num_samples, value); - } - - // 生成立体声测试数据 - aligned_audio_buffer 
generate_stereo_test_data(size_t num_stereo_samples) const { - aligned_audio_buffer data(num_stereo_samples * 2); - for (size_t i = 0; i < num_stereo_samples; ++i) { - data[i * 2] = std::sin(2.0f * M_PI * 440.0f * i / 44100.0f); // 左声道: 440Hz - data[i * 2 + 1] = std::sin(2.0f * M_PI * 880.0f * i / 44100.0f); // 右声道: 880Hz - } - return data; - } - - // 生成边界测试数据 - aligned_audio_buffer generate_boundary_data(size_t num_samples) const { - aligned_audio_buffer data; - data.reserve(num_samples); - - // 添加各种边界值 - if (num_samples > 0) - data.push_back(0.0f); - if (num_samples > 1) - data.push_back(1.0f); - if (num_samples > 2) - data.push_back(-1.0f); - if (num_samples > 3) - data.push_back(std::numeric_limits::min()); - if (num_samples > 4) - data.push_back(std::numeric_limits::max()); - if (num_samples > 5) - data.push_back(std::numeric_limits::epsilon()); - if (num_samples > 6) - data.push_back(-std::numeric_limits::epsilon()); - - // 填充剩余位置 - std::uniform_real_distribution dist(-1.0f, 1.0f); - while (data.size() < num_samples) { - data.push_back(dist(rng_)); - } - - return data; - } +/** + * @brief 测试参数结构体 + */ +struct AudioProcessingTestParams { + size_t buffer_size; // 缓冲区大小 + int num_channels; // 通道数 + float gain_value; // 增益值 }; /** - * 性能测试辅助类 + * @brief 填充缓冲区功能测试类 */ -class PerformanceTester { -public: - template - double measure_execution_time(Func&& func, int iterations = PERF_TEST_ITERATIONS) { - auto start = std::chrono::high_resolution_clock::now(); - - for (int i = 0; i < iterations; ++i) { - func(); - } - - auto end = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(end - start); - - return duration.count() / 1e6 / iterations; // 返回平均毫秒数 - } - - void print_performance_comparison(const std::string& test_name, - double scalar_time, - double simd_time) { - double speedup = scalar_time / simd_time; - std::cout << "[PERF] " << test_name << std::endl; - std::cout << " 标量版本: " << std::fixed << std::setprecision(3) << 
scalar_time << "ms" << std::endl; - std::cout << " SIMD版本: " << std::fixed << std::setprecision(3) << simd_time << "ms" << std::endl; - std::cout << " 加速比: " << std::fixed << std::setprecision(2) << speedup << "x" << std::endl; - } -}; - -// 音频处理函数测试类 -class AudioProcessingTest : public ::testing::Test { +class FillBufferTest : public ::testing::TestWithParam { protected: - void SetUp() override { - // 初始化测试环境 - audio_processing_registry::register_all_functions(); - } - - void TearDown() override { - // 清理测试环境 - } - - // 全局实例 - AudioDataGenerator data_gen; - PerformanceTester perf_tester; + void SetUp() override { + params = GetParam(); + } + + AudioProcessingTestParams params; }; +TEST_P(FillBufferTest, FillsBufferWithValue) { + const float fill_value = 3.14f; + size_t total_samples = params.buffer_size * params.num_channels; + + std::vector buffer(total_samples); + simd::fill_buffer(buffer.data(), fill_value, total_samples); + + for (size_t i = 0; i < total_samples; ++i) { + EXPECT_FLOAT_EQ(buffer[i], fill_value) + << "值不匹配于索引 " << i; + } +} + +TEST_P(FillBufferTest, FillWithZero) { + size_t total_samples = params.buffer_size * params.num_channels; + std::vector buffer(total_samples); + + simd::fill_buffer(buffer.data(), 0.0f, total_samples); + + for (size_t i = 0; i < total_samples; ++i) { + EXPECT_EQ(buffer[i], 0.0f); + } +} + +TEST_P(FillBufferTest, FillWithNegativeValue) { + const float fill_value = -2.5f; + size_t total_samples = params.buffer_size * params.num_channels; + std::vector buffer(total_samples); + + simd::fill_buffer(buffer.data(), fill_value, total_samples); + + for (size_t i = 0; i < total_samples; ++i) { + EXPECT_FLOAT_EQ(buffer[i], fill_value); + } +} + +INSTANTIATE_TEST_SUITE_P( + FillBufferParametrized, + FillBufferTest, + ::testing::Values( + AudioProcessingTestParams{64, 1, 1.0f}, + AudioProcessingTestParams{256, 2, 1.0f}, + AudioProcessingTestParams{1024, 4, 1.0f}, + AudioProcessingTestParams{4096, 8, 1.0f}, + 
AudioProcessingTestParams{8192, 1, 1.0f} + ) +); + /** - * ======================================== - * 基础功能测试 - * ======================================== + * @brief 应用增益功能测试类 */ +class ApplyGainTest : public ::testing::TestWithParam { +protected: + void SetUp() override { + params = GetParam(); + } + + AudioProcessingTestParams params; +}; -// 测试 simd_audio_processing_registry 注册功能 -TEST_F(AudioProcessingTest, RegistryRegistration) { - // 打印已注册的函数以供调试 - audio_processing_registry::print_available_functions(); - - // 验证关键函数已注册 - auto& dispatcher = simd_func_dispatcher::instance(); - - EXPECT_NO_THROW({ - dispatcher.get_function("mix_audio"); - }) << "函数 mix_audio 未正确注册"; - - EXPECT_NO_THROW({ - dispatcher.get_function("apply_gain"); - }) << "函数 apply_gain 未正确注册"; - - EXPECT_NO_THROW({ - dispatcher.get_function("calculate_rms"); - }) << "函数 calculate_rms 未正确注册"; - - EXPECT_NO_THROW({ - dispatcher.get_function("calculate_peak"); - }) << "函数 calculate_peak 未正确注册"; - - EXPECT_NO_THROW({ - dispatcher.get_function("normalize_audio"); - }) << "函数 normalize_audio 未正确注册"; - - EXPECT_NO_THROW({ - dispatcher.get_function("stereo_to_mono"); - }) << "函数 stereo_to_mono 未正确注册"; +TEST_P(ApplyGainTest, AppliesGainCorrectly) { + size_t total_samples = params.buffer_size * params.num_channels; + const float gain = params.gain_value; + + std::vector src(total_samples, 2.0f); + std::vector dst(total_samples, 0.0f); + + simd::apply_gain(src.data(), dst.data(), gain, total_samples); + + const float expected_value = 2.0f * gain; + for (size_t i = 0; i < total_samples; ++i) { + EXPECT_FLOAT_EQ(dst[i], expected_value) + << "增益应用错误于索引 " << i; + } } -// 测试 mix_audio 函数 -TEST_F(AudioProcessingTest, MixAudioBasic) { - const size_t num_samples = 16; - auto src1 = data_gen.generate_sine_wave(num_samples, 440.0f); - auto src2 = data_gen.generate_sine_wave(num_samples, 880.0f); - aligned_audio_buffer result(num_samples); - aligned_audio_buffer expected(num_samples); - - // 计算期望结果 - for (size_t i = 
0; i < num_samples; ++i) { - expected[i] = src1[i] + src2[i]; - } - - // 测试标量版本 - scalar_audio_processing_func::mix_audio(src1.data(), src2.data(), result.data(), num_samples); - - EXPECT_TRUE(arrays_equal(result.data(), expected.data(), num_samples)) - << "混合音频结果与期望不符"; +TEST_P(ApplyGainTest, GainZeroMakesOutputZero) { + size_t total_samples = params.buffer_size * params.num_channels; + std::vector src(total_samples, 5.0f); + std::vector dst(total_samples, 1.0f); + + simd::apply_gain(src.data(), dst.data(), 0.0f, total_samples); + + for (size_t i = 0; i < total_samples; ++i) { + EXPECT_FLOAT_EQ(dst[i], 0.0f); + } } -// 测试 apply_gain 函数 -TEST_F(AudioProcessingTest, ApplyGainBasic) { - const size_t num_samples = 16; - const float gain = 0.5f; - auto src = data_gen.generate_sine_wave(num_samples); - std::vector> result(num_samples); - std::vector> expected(num_samples); - - // 计算期望结果 - for (size_t i = 0; i < num_samples; ++i) { - expected[i] = src[i] * gain; - } - - // 测试标量版本 - scalar_audio_processing_func::apply_gain(src.data(), result.data(), gain, num_samples); - - EXPECT_TRUE(arrays_equal(result.data(), expected.data(), num_samples)) - << "增益应用结果与期望不符"; +TEST_P(ApplyGainTest, GainOnePreservesValues) { + size_t total_samples = params.buffer_size * params.num_channels; + auto raw_src = generate_sine_wave(0.1f, total_samples); + std::vector src(raw_src.begin(), raw_src.end()); + std::vector dst(total_samples, 0.0f); + + simd::apply_gain(src.data(), dst.data(), 1.0f, total_samples); + + // 验证增益为1.0时,源数据应该保持不变 + for (size_t i = 0; i < total_samples; ++i) { + EXPECT_FLOAT_EQ(src[i], raw_src[i]); + } } -// 测试 calculate_rms 函数 -TEST_F(AudioProcessingTest, CalculateRmsBasic) { - const size_t num_samples = 1024; - auto src = data_gen.generate_sine_wave(num_samples); - - // 计算期望的RMS值 - double sum_squares = 0.0; - for (size_t i = 0; i < num_samples; ++i) { - sum_squares += src[i] * src[i]; - } - float expected_rms = std::sqrt(sum_squares / num_samples); - - // 测试标量版本 - float 
result_rms = scalar_audio_processing_func::calculate_rms(src.data(), num_samples); - - EXPECT_TRUE(float_equal(result_rms, expected_rms, RMS_TOLERANCE)) - << "期望 RMS: " << expected_rms << ", 得到: " << result_rms; +TEST_P(ApplyGainTest, NegativeGainInvertsSignal) { + size_t total_samples = params.buffer_size * params.num_channels; + std::vector src(total_samples, 3.0f); + std::vector dst(total_samples, 0.0f); + + simd::apply_gain(src.data(), dst.data(), -1.0f, total_samples); + + for (size_t i = 0; i < total_samples; ++i) { + EXPECT_FLOAT_EQ(dst[i], -3.0f); + } } -// 测试 calculate_peak 函数 -TEST_F(AudioProcessingTest, CalculatePeakBasic) { - const size_t num_samples = 1024; - auto src = data_gen.generate_boundary_data(num_samples); +INSTANTIATE_TEST_SUITE_P( + ApplyGainParametrized, + ApplyGainTest, + ::testing::Values( + AudioProcessingTestParams{64, 1, 0.0f}, + AudioProcessingTestParams{256, 2, 0.5f}, + AudioProcessingTestParams{1024, 4, 1.0f}, + AudioProcessingTestParams{4096, 8, 2.0f}, + AudioProcessingTestParams{8192, 1, 0.1f} + ) +); - // 计算期望的峰值 - float expected_peak = 0.0f; - for (size_t i = 0; i < num_samples; ++i) { - expected_peak = std::max(expected_peak, std::abs(src[i])); - } +/** + * @brief 混音功能测试类 + */ +class MixAudioTest : public ::testing::TestWithParam { +protected: + void SetUp() override { + params = GetParam(); + } + + AudioProcessingTestParams params; +}; - // 测试标量版本 - float result_peak = scalar_audio_processing_func::calculate_peak(src.data(), num_samples); +TEST_P(MixAudioTest, MixesAudioCorrectly) { + size_t total_samples = params.buffer_size * params.num_channels; + std::vector src1(total_samples, 1.0f); + std::vector src2(total_samples, 2.0f); + std::vector dst(total_samples, 0.0f); + + simd::mix_audio(src1.data(), src2.data(), dst.data(), total_samples); + + for (size_t i = 0; i < total_samples; ++i) { + EXPECT_FLOAT_EQ(dst[i], 3.0f) + << "混音结果错误于索引 " << i; + } +} - EXPECT_TRUE(float_equal(result_peak, expected_peak, PEAK_TOLERANCE)) - << 
"期望峰值: " << expected_peak << ", 得到: " << result_peak; +TEST_P(MixAudioTest, MixWithZero) { + size_t total_samples = params.buffer_size * params.num_channels; + std::vector src1 = generate_sine_wave(0.1f, total_samples); + std::vector src2(total_samples, 0.0f); + std::vector dst(total_samples, 0.0f); + + simd::mix_audio(src1.data(), src2.data(), dst.data(), total_samples); + + EXPECT_TRUE(buffers_equal(src1, dst, 1e-5f)); +} + +TEST_P(MixAudioTest, MixSymmetry) { + size_t total_samples = params.buffer_size * params.num_channels; + std::vector src1 = generate_sine_wave(0.1f, total_samples); + std::vector src2 = generate_sine_wave(0.05f, total_samples); + std::vector dst1(total_samples, 0.0f); + std::vector dst2(total_samples, 0.0f); + + simd::mix_audio(src1.data(), src2.data(), dst1.data(), total_samples); + simd::mix_audio(src2.data(), src1.data(), dst2.data(), total_samples); + + EXPECT_TRUE(buffers_equal(dst1, dst2, 1e-5f)); +} + +INSTANTIATE_TEST_SUITE_P( + MixAudioParametrized, + MixAudioTest, + ::testing::Values( + AudioProcessingTestParams{64, 1, 0.0f}, + AudioProcessingTestParams{256, 2, 0.0f}, + AudioProcessingTestParams{1024, 4, 0.0f}, + AudioProcessingTestParams{4096, 8, 0.0f}, + AudioProcessingTestParams{8192, 1, 0.0f} + ) +); + +// ============================================================================ +// 边界条件测试 +// ============================================================================ + +class BoundaryConditionTest : public ::testing::Test { +protected: + const float epsilon = 1e-5f; +}; + +TEST_F(BoundaryConditionTest, FillBufferZeroLength) { + std::vector buffer(10, 1.0f); + simd::fill_buffer(buffer.data(), 0.0f, 0); // 不应该修改任何元素 + + for (auto val : buffer) { + EXPECT_EQ(val, 1.0f); + } +} + +TEST_F(BoundaryConditionTest, FillBufferSingleSample) { + std::vector buffer(1); + const float fill_value = 7.5f; + + simd::fill_buffer(buffer.data(), fill_value, 1); + + EXPECT_FLOAT_EQ(buffer[0], fill_value); +} + +TEST_F(BoundaryConditionTest, 
ApplyGainZeroLength) { + std::vector src(10, 5.0f); + std::vector dst(10, 1.0f); + + simd::apply_gain(src.data(), dst.data(), 2.0f, 0); + + for (auto val : dst) { + EXPECT_EQ(val, 1.0f); + } +} + +TEST_F(BoundaryConditionTest, ApplyGainSingleSample) { + std::vector src = {3.0f}; + std::vector dst = {0.0f}; + + simd::apply_gain(src.data(), dst.data(), 4.0f, 1); + + EXPECT_FLOAT_EQ(dst[0], 12.0f); +} + +TEST_F(BoundaryConditionTest, MixAudioZeroLength) { + std::vector src1(10, 1.0f); + std::vector src2(10, 2.0f); + std::vector dst(10, 0.0f); + + simd::mix_audio(src1.data(), src2.data(), dst.data(), 0); + + for (auto val : dst) { + EXPECT_EQ(val, 0.0f); + } +} + +TEST_F(BoundaryConditionTest, MixAudioSingleSample) { + std::vector src1 = {2.5f}; + std::vector src2 = {3.5f}; + std::vector dst = {0.0f}; + + simd::mix_audio(src1.data(), src2.data(), dst.data(), 1); + + EXPECT_FLOAT_EQ(dst[0], 6.0f); +} + +TEST_F(BoundaryConditionTest, ExtremeLargeValues) { + std::vector buffer(100); + const float large_value = 1e6f; + + simd::fill_buffer(buffer.data(), large_value, buffer.size()); + + for (auto val : buffer) { + EXPECT_FLOAT_EQ(val, large_value); + } +} + +TEST_F(BoundaryConditionTest, ExtremeSmallValues) { + std::vector buffer(100); + const float small_value = 1e-6f; + + simd::fill_buffer(buffer.data(), small_value, buffer.size()); + + for (auto val : buffer) { + EXPECT_NEAR(val, small_value, 1e-12f); + } +} + +TEST_F(BoundaryConditionTest, FillBufferWithMaxFloat) { + std::vector buffer(10); + const float max_val = std::numeric_limits::max(); + + simd::fill_buffer(buffer.data(), max_val, buffer.size()); + + for (auto val : buffer) { + EXPECT_EQ(val, max_val); + } +} + +TEST_F(BoundaryConditionTest, ApplyGainWithVerySmallGain) { + std::vector src(100, 1e6f); + std::vector dst(100); + const float tiny_gain = 1e-7f; + + simd::apply_gain(src.data(), dst.data(), tiny_gain, src.size()); + + for (size_t i = 0; i < src.size(); ++i) { + EXPECT_NEAR(dst[i], src[i] * tiny_gain, 
1e-10f); + } +} + +// ============================================================================ +// 数据对齐测试 +// ============================================================================ + +class AlignmentTest : public ::testing::Test { +protected: + template + bool check_alignment(const void* ptr) const { + return reinterpret_cast(ptr) % Alignment == 0; + } +}; + +TEST_F(AlignmentTest, SSEAlignedBuffer) { + std::vector> buffer(1024); + EXPECT_TRUE(check_alignment(buffer.data())); +} + +TEST_F(AlignmentTest, AVXAlignedBuffer) { + std::vector> buffer(1024); + EXPECT_TRUE(check_alignment(buffer.data())); +} + +TEST_F(AlignmentTest, AVX512AlignedBuffer) { + std::vector> buffer(1024); + EXPECT_TRUE(check_alignment(buffer.data())); +} + +TEST_F(AlignmentTest, FillBufferMaintainsAlignment) { + std::vector buffer(1024); + EXPECT_TRUE(check_alignment(buffer.data())); + + simd::fill_buffer(buffer.data(), 1.0f, buffer.size()); + + // 数据仍然在原地,对齐不变 + EXPECT_TRUE(check_alignment(buffer.data())); +} + +TEST_F(AlignmentTest, ApplyGainWithAlignedData) { + std::vector src(256); + std::vector dst(256); + + EXPECT_TRUE(check_alignment(src.data())); + EXPECT_TRUE(check_alignment(dst.data())); + + simd::apply_gain(src.data(), dst.data(), 0.5f, src.size()); + + // 对齐在操作后仍然有效 + EXPECT_TRUE(check_alignment(src.data())); + EXPECT_TRUE(check_alignment(dst.data())); +} + +TEST_F(AlignmentTest, MixAudioWithAlignedData) { + std::vector src1(512); + std::vector src2(512); + std::vector dst(512); + + EXPECT_TRUE(check_alignment(src1.data())); + EXPECT_TRUE(check_alignment(src2.data())); + EXPECT_TRUE(check_alignment(dst.data())); + + simd::mix_audio(src1.data(), src2.data(), dst.data(), src1.size()); + + EXPECT_TRUE(check_alignment(dst.data())); +} + +TEST_F(AlignmentTest, UnalignedBufferHandling) { + // 创建一个未对齐的缓冲区(通过跳过第一个字节) + std::vector aligned_storage(1025); + float* unaligned_ptr = aligned_storage.data() + 1; // 偏移1个元素,破坏对齐 + + // 虽然指针未对齐,但函数应该仍然能工作(可能性能较低) + // 这测试了函数的鲁棒性,而不是对齐的要求 + 
std::vector expected(1024, 5.0f); + + simd::fill_buffer(unaligned_ptr, 5.0f, 1024); + + for (size_t i = 0; i < 1024; ++i) { + EXPECT_FLOAT_EQ(unaligned_ptr[i], 5.0f); + } +} + +// ============================================================================ +// 性能基准测试 +// ============================================================================ + +class PerformanceBenchmark : public ::testing::Test { +protected: + const int ITERATIONS = 100; +}; + +TEST_F(PerformanceBenchmark, FillBufferThroughput64Samples) { + std::vector buffer(64); + + LatencyRecorder latency_recorder("FillBuffer_64"); + + for (int iter = 0; iter < ITERATIONS; ++iter) { + ScopedTimer timer("", false); + simd::fill_buffer(buffer.data(), 1.0f, buffer.size()); + timer.stop(); + latency_recorder.record(timer.elapsed_ns()); + } + + auto stats = latency_recorder.get_statistics(); + double samples_per_sec = (64.0 * ITERATIONS * 1e9) / (stats.avg() * ITERATIONS); + + std::cout << "FillBuffer (64 samples): " << samples_per_sec / 1e6 << " MSamples/sec" << std::endl; + EXPECT_GT(samples_per_sec, 0.0); // 基本健全性检查 +} + +TEST_F(PerformanceBenchmark, FillBufferThroughput1024Samples) { + std::vector buffer(1024); + + LatencyRecorder latency_recorder("FillBuffer_1024"); + + for (int iter = 0; iter < ITERATIONS; ++iter) { + ScopedTimer timer("", false); + simd::fill_buffer(buffer.data(), 1.0f, buffer.size()); + timer.stop(); + latency_recorder.record(timer.elapsed_ns()); + } + + auto stats = latency_recorder.get_statistics(); + double samples_per_sec = (1024.0 * ITERATIONS * 1e9) / (stats.avg() * ITERATIONS); + + std::cout << "FillBuffer (1024 samples): " << samples_per_sec / 1e6 << " MSamples/sec" << std::endl; + EXPECT_GT(samples_per_sec, 0.0); +} + +TEST_F(PerformanceBenchmark, ApplyGainThroughput) { + std::vector src(1024, 1.0f); + std::vector dst(1024); + + LatencyRecorder latency_recorder("ApplyGain_1024"); + + for (int iter = 0; iter < ITERATIONS; ++iter) { + ScopedTimer timer("", false); + 
simd::apply_gain(src.data(), dst.data(), 0.5f, src.size()); + timer.stop(); + latency_recorder.record(timer.elapsed_ns()); + } + + auto stats = latency_recorder.get_statistics(); + double samples_per_sec = (1024.0 * ITERATIONS * 1e9) / (stats.avg() * ITERATIONS); + + std::cout << "ApplyGain (1024 samples): " << samples_per_sec / 1e6 << " MSamples/sec" << std::endl; + EXPECT_GT(samples_per_sec, 0.0); +} + +TEST_F(PerformanceBenchmark, MixAudioThroughput) { + std::vector src1(1024, 1.0f); + std::vector src2(1024, 2.0f); + std::vector dst(1024); + + LatencyRecorder latency_recorder("MixAudio_1024"); + + for (int iter = 0; iter < ITERATIONS; ++iter) { + ScopedTimer timer("", false); + simd::mix_audio(src1.data(), src2.data(), dst.data(), src1.size()); + timer.stop(); + latency_recorder.record(timer.elapsed_ns()); + } + + auto stats = latency_recorder.get_statistics(); + double samples_per_sec = (1024.0 * ITERATIONS * 1e9) / (stats.avg() * ITERATIONS); + + std::cout << "MixAudio (1024 samples): " << samples_per_sec / 1e6 << " MSamples/sec" << std::endl; + EXPECT_GT(samples_per_sec, 0.0); +} + +// ============================================================================ +// 跨实现一致性测试 +// ============================================================================ + +class CrossImplementationConsistencyTest : public ::testing::Test { +protected: + void SetUp() override { + // 初始化测试数据 + test_data_ = generate_sine_wave(0.1f, 1024); + expected_result_ = test_data_; + } + + std::vector test_data_; + std::vector expected_result_; +}; + +TEST_F(CrossImplementationConsistencyTest, FillBufferConsistency) { + std::vector result(1024); + + // 使用SIMD实现 + simd::fill_buffer(result.data(), 5.0f, 1024); + + // 验证所有元素都是5.0 + for (size_t i = 0; i < 1024; ++i) { + EXPECT_FLOAT_EQ(result[i], 5.0f) + << "SIMD实现结果不一致于索引 " << i; + } +} + +TEST_F(CrossImplementationConsistencyTest, ApplyGainConsistency) { + std::vector src(1024, 3.0f); + std::vector result(1024); + + // 使用SIMD实现 + 
simd::apply_gain(src.data(), result.data(), 2.0f, 1024); + + // 验证结果为6.0(3.0 * 2.0) + for (size_t i = 0; i < 1024; ++i) { + EXPECT_FLOAT_EQ(result[i], 6.0f) + << "SIMD实现结果不一致于索引 " << i; + } +} + +TEST_F(CrossImplementationConsistencyTest, MixAudioConsistency) { + std::vector src1(1024, 1.0f); + std::vector src2(1024, 2.0f); + std::vector result(1024); + + // 使用SIMD实现 + simd::mix_audio(src1.data(), src2.data(), result.data(), 1024); + + // 验证结果为3.0(1.0 + 2.0) + for (size_t i = 0; i < 1024; ++i) { + EXPECT_FLOAT_EQ(result[i], 3.0f) + << "SIMD实现结果不一致于索引 " << i; + } +} + +// ============================================================================ +// 额外功能测试 +// ============================================================================ + +/** + * @brief 计算RMS值的辅助函数 + */ +float calculate_rms(const std::vector& data) { + double sum = 0.0; + for (float val : data) { + sum += val * val; + } + return std::sqrt(sum / data.size()); } /** - * ======================================== - * 新增功能测试 - * ======================================== + * @brief 计算峰值辅助函数 */ - -// 测试 normalize_audio 函数 -TEST_F(AudioProcessingTest, NormalizeAudioBasic) { - const size_t num_samples = 1024; - const float target_peak = 0.8f; - auto src = data_gen.generate_sine_wave(num_samples, 440.0f, 44100.0f, 2.0f); // 超过1.0的幅度 - aligned_audio_buffer result(num_samples); - - // 测试标量版本 - scalar_audio_processing_func::normalize_audio(src.data(), result.data(), target_peak, num_samples); - - // 验证归一化后的峰值 - float actual_peak = scalar_audio_processing_func::calculate_peak(result.data(), num_samples); - - EXPECT_TRUE(float_equal(actual_peak, target_peak, PEAK_TOLERANCE)) - << "期望峰值: " << target_peak << ", 实际峰值: " << actual_peak; +float calculate_peak(const std::vector& data) { + float peak = 0.0f; + for (float val : data) { + peak = std::max(peak, std::abs(val)); + } + return peak; } -// 测试 stereo_to_mono 函数 -TEST_F(AudioProcessingTest, StereoToMonoBasic) { - const size_t num_stereo_samples = 512; - auto 
stereo_src = data_gen.generate_stereo_test_data(num_stereo_samples); - aligned_audio_buffer mono_result(num_stereo_samples); - aligned_audio_buffer expected_mono(num_stereo_samples); +class AdditionalFunctionalityTest : public ::testing::Test { +protected: + void SetUp() override { + test_data_ = generate_sine_wave(0.1f, 1024); + } + + std::vector test_data_; +}; - // 计算期望结果 - for (size_t i = 0; i < num_stereo_samples; ++i) { - expected_mono[i] = (stereo_src[i * 2] + stereo_src[i * 2 + 1]) * 0.5f; - } - - // 测试标量版本 - scalar_audio_processing_func::stereo_to_mono(stereo_src.data(), mono_result.data(), num_stereo_samples); - - EXPECT_TRUE(arrays_equal(mono_result.data(), expected_mono.data(), num_stereo_samples)) - << "立体声转单声道果与期望不符"; +TEST_F(AdditionalFunctionalityTest, RMSCalculation) { + std::vector src(test_data_.begin(), test_data_.end()); + std::vector dst(1024); + + // 使用SIMD实现计算RMS + float rms = simd::calculate_rms(src.data(), 1024); + + // 验证RMS值在合理范围内 + EXPECT_GT(rms, 0.0); + EXPECT_LT(rms, 1.0); + + // 验证与参考实现的一致性 + std::vector test_data_copy(test_data_.begin(), test_data_.end()); + float expected_rms = calculate_rms(test_data_copy); + EXPECT_NEAR(rms, expected_rms, 0.01f); } -// 测试 limit_audio 函数 -TEST_F(AudioProcessingTest, LimitAudioBasic) { - const size_t num_samples = 1024; - const float threshold = 0.5f; - auto src = data_gen.generate_sine_wave(num_samples, 440.0f, 44100.0f, 1.0f); - aligned_audio_buffer result(num_samples); - float limiter_state = 1.0f; - - // 测试标量版本 - scalar_audio_processing_func::limit_audio(src.data(), result.data(), threshold, &limiter_state, 44100.f, - num_samples); - - // 验证没有样本超过阈值 - bool all_samples_limited = true; - size_t violation_index = 0; - float violation_value = 0.0f; - - for (size_t i = 0; i < num_samples; ++i) { - if (std::abs(result[i]) > threshold + FLOAT_TOLERANCE) { - all_samples_limited = false; - violation_index = i; - violation_value = result[i]; - break; - } - } - - EXPECT_TRUE(all_samples_limited) - << "样本 
" << violation_index << " 超过阈值: " << violation_value; +TEST_F(AdditionalFunctionalityTest, PeakCalculation) { + std::vector src(test_data_.begin(), test_data_.end()); + + // 使用SIMD实现计算峰值 + float peak = simd::calculate_peak(src.data(), 1024); + + // 验证峰值在合理范围内 + EXPECT_GT(peak, 0.0); + EXPECT_LT(peak, 1.1f); // 允许一些误差 + + // 验证与参考实现的一致性 + std::vector test_data_copy(test_data_.begin(), test_data_.end()); + float expected_peak = calculate_peak(test_data_copy); + EXPECT_NEAR(peak, expected_peak, 0.01f); } -// 测试 fade_audio 函数 -TEST_F(AudioProcessingTest, FadeAudioBasic) { - const size_t num_samples = 1024; - const size_t fade_in_samples = 100; - const size_t fade_out_samples = 100; - auto src = data_gen.generate_dc(num_samples, 1.0f); - aligned_audio_buffer result(num_samples); - - // 测试标量版本 - scalar_audio_processing_func::fade_audio(src.data(), result.data(), fade_in_samples, fade_out_samples, num_samples); - - // 验证淡入淡出效果 - - // 检查淡入部分 - EXPECT_FLOAT_EQ(result[0], 0.0f) - << "淡入开始应为0,实际为: " << result[0]; - - // 检查中间部分(应该是1.0) - EXPECT_FLOAT_EQ(result[num_samples / 2], 1.0f) - << "中间部分应保持原始值1.0,实际为: " << result[num_samples / 2]; - - // 检查淡出部分 - EXPECT_TRUE(float_equal(result[num_samples - 1], 0.0f, FLOAT_TOLERANCE)) - << "淡出结束应为0,实际为: " << result[num_samples - 1]; +TEST_F(AdditionalFunctionalityTest, StereoToMono) { + // 创建交错的立体声测试数据 (LRLRLR...) 
+ std::vector stereo_interleaved(1024); + std::vector mono_result(512); + + // 填充交错数据:左声道=1.0,右声道=2.0 + for (size_t i = 0; i < 512; ++i) { + stereo_interleaved[i * 2] = 1.0f; // 左声道 + stereo_interleaved[i * 2 + 1] = 2.0f; // 右声道 + } + + // 转换为单声道 + simd::stereo_to_mono(stereo_interleaved.data(), mono_result.data(), 512); + + // 验证结果为平均值 (1.0 + 2.0) / 2 = 1.5 + for (size_t i = 0; i < 512; ++i) { + EXPECT_FLOAT_EQ(mono_result[i], 1.5f); + } } -// 测试 simple_eq 函数 -TEST_F(AudioProcessingTest, SimpleEqBasic) { - const size_t num_samples = 1024; - const float low_gain = 1.2f; - const float mid_gain = 1.0f; - const float high_gain = 0.8f; - auto src = data_gen.generate_white_noise(num_samples, 0.5f); - aligned_audio_buffer result(num_samples); - aligned_audio_buffer eq_state(2, 0.0f); // 低通和高通滤波器状态 - - // 测试标量版本 - scalar_audio_processing_func::simple_eq(src.data(), result.data(), low_gain, mid_gain, high_gain, eq_state.data(), - num_samples); - - // 基本验证:结果不应该全为零(除非输入全为零) - bool has_nonzero = false; - for (size_t i = 0; i < num_samples; ++i) { - if (result[i] != 0.0f) { - has_nonzero = true; - break; - } - } - - EXPECT_TRUE(has_nonzero) - << "EQ处理后的结果全为零,可能存在处理错误"; -} - -/** - * ======================================== - * 边界条件测试 - * ======================================== - */ - -// 测试零长度输入 -TEST_F(AudioProcessingTest, ZeroLengthInput) { - const size_t num_samples = 0; - aligned_audio_buffer dummy(1, 0.0f); - aligned_audio_buffer result(1, 0.0f); - float state = 1.0f; - - // 这些函数应该能安全处理零长度输入 - EXPECT_NO_THROW({ - scalar_audio_processing_func::mix_audio(dummy.data(), dummy.data(), result.data(), num_samples); - scalar_audio_processing_func::apply_gain(dummy.data(), result.data(), 1.0f, num_samples); - scalar_audio_processing_func::normalize_audio(dummy.data(), result.data(), 1.0f, num_samples); - scalar_audio_processing_func::stereo_to_mono(dummy.data(), result.data(), num_samples); - scalar_audio_processing_func::limit_audio(dummy.data(), result.data(), 1.0f, &state, 
44100.f, num_samples); - scalar_audio_processing_func::fade_audio(dummy.data(), result.data(), 0, 0, num_samples); - scalar_audio_processing_func::simple_eq(dummy.data(), result.data(), 1.0f, 1.0f, 1.0f, &state, num_samples); - }); -} - -// 测试单样本输入 -TEST_F(AudioProcessingTest, SingleSampleInput) { - const size_t num_samples = 1; - aligned_audio_buffer src1{0.5f}; - aligned_audio_buffer src2{0.3f}; - aligned_audio_buffer result(1); - float state = 1.0f; - - // 测试混合 - scalar_audio_processing_func::mix_audio(src1.data(), src2.data(), result.data(), num_samples); - EXPECT_TRUE(float_equal(result[0], 0.8f)) - << "混合单样本: 期望0.8,实际" << result[0]; - - // 测试增益 - scalar_audio_processing_func::apply_gain(src1.data(), result.data(), 2.0f, num_samples); - EXPECT_TRUE(float_equal(result[0], 1.0f)) - << "应用增益: 期望1.0,实际" << result[0]; - - // 测试RMS - float rms = scalar_audio_processing_func::calculate_rms(src1.data(), num_samples); - EXPECT_TRUE(float_equal(rms, 0.5f)) - << "单样本RMS: 期望0.5,实际" << rms; - - // 测试峰值 - float peak = scalar_audio_processing_func::calculate_peak(src1.data(), num_samples); - EXPECT_TRUE(float_equal(peak, 0.5f)) - << "单样本峰值: 期望0.5,实际" << peak; -} - -// 测试极值处理 -TEST_F(AudioProcessingTest, ExtremeValues) { - const size_t num_samples = 8; - aligned_audio_buffer extreme_values = { - 0.0f, - 1.0f, - -1.0f, - std::numeric_limits::max(), - -std::numeric_limits::max(), - std::numeric_limits::min(), - std::numeric_limits::epsilon(), - -std::numeric_limits::epsilon() - }; - - aligned_audio_buffer result(num_samples); - - // 测试峰值计算对极值的处理 - float peak = scalar_audio_processing_func::calculate_peak(extreme_values.data(), num_samples); - EXPECT_EQ(peak, std::numeric_limits::max()) - << "极值峰值检测失败,期望" << std::numeric_limits::max() << ",实际" << peak; - - // 测试增益对极值的处理 - scalar_audio_processing_func::apply_gain(extreme_values.data(), result.data(), 0.5f, num_samples); - bool all_finite = true; - size_t nan_inf_index = 0; - - for (size_t i = 0; i < num_samples; ++i) { - if 
(std::isnan(result[i]) || std::isinf(result[i])) { - all_finite = false; - nan_inf_index = i; - break; - } - } - - EXPECT_TRUE(all_finite) - << "增益处理后在位置" << nan_inf_index << "存在NaN或Inf"; -} - -/** - * ======================================== - * 一致性测试(标量 vs SIMD) - * ======================================== - */ - -#if ALICHO_PLATFORM_X86 - -// 测试x86 SIMD版本与标量版的一致性 -TEST_F(AudioProcessingTest, X86SimdConsistency) { - const size_t num_samples = 1024; - auto src1 = data_gen.generate_sine_wave(num_samples, 440.0f); - auto src2 = data_gen.generate_sine_wave(num_samples, 880.0f); - auto stereo_src = data_gen.generate_stereo_test_data(num_samples); - - aligned_audio_buffer scalar_result(num_samples); - aligned_audio_buffer sse_result(num_samples); - aligned_audio_buffer avx_result(num_samples); - std::vector> avx512_result(num_samples); - - // 测试 mix_audio 一致性 - scalar_audio_processing_func::mix_audio(src1.data(), src2.data(), scalar_result.data(), num_samples); - x86_simd_audio_processing_func::mix_audio_sse(src1.data(), src2.data(), sse_result.data(), num_samples); - x86_simd_audio_processing_func::mix_audio_avx(src1.data(), src2.data(), avx_result.data(), num_samples); - x86_simd_audio_processing_func::mix_audio_avx512(src1.data(), src2.data(), avx512_result.data(), num_samples); - - EXPECT_TRUE(arrays_equal(scalar_result.data(), sse_result.data(), num_samples)) - << "mix_audio SSE版本与标量版本不一致"; - EXPECT_TRUE(arrays_equal(scalar_result.data(), avx_result.data(), num_samples)) - << "mix_audio AVX版本与标量版本不一致"; - EXPECT_TRUE(arrays_equal(scalar_result.data(), avx512_result.data(), num_samples)) - << "mix_audio AVX512版本与标量版本不一致"; - - // 测试 apply_gain 一致性 - const float gain = 0.75f; - scalar_audio_processing_func::apply_gain(src1.data(), scalar_result.data(), gain, num_samples); - x86_simd_audio_processing_func::apply_gain_sse(src1.data(), sse_result.data(), gain, num_samples); - x86_simd_audio_processing_func::apply_gain_avx(src1.data(), avx_result.data(), gain, 
num_samples); - x86_simd_audio_processing_func::apply_gain_avx512(src1.data(), avx512_result.data(), gain, num_samples); - - EXPECT_TRUE(arrays_equal(scalar_result.data(), sse_result.data(), num_samples)) - << "apply_gain SSE版本与标量版本不一致"; - EXPECT_TRUE(arrays_equal(scalar_result.data(), avx_result.data(), num_samples)) - << "apply_gain AVX版本与标量版本不一致"; - EXPECT_TRUE(arrays_equal(scalar_result.data(), avx512_result.data(), num_samples)) - << "apply_gain AVX512版本与标量版本不一致"; - - // 测试 calculate_rms 一致性 - float scalar_rms = scalar_audio_processing_func::calculate_rms(src1.data(), num_samples); - float sse_rms = x86_simd_audio_processing_func::calculate_rms_sse(src1.data(), num_samples); - float avx_rms = x86_simd_audio_processing_func::calculate_rms_avx(src1.data(), num_samples); - float avx512_rms = x86_simd_audio_processing_func::calculate_rms_avx512(src1.data(), num_samples); - - EXPECT_TRUE(float_equal(scalar_rms, sse_rms, RMS_TOLERANCE)) - << "calculate_rms SSE版本与标量版本不一致: " << scalar_rms << " vs " << sse_rms; - EXPECT_TRUE(float_equal(scalar_rms, avx_rms, RMS_TOLERANCE)) - << "calculate_rms AVX版本与标量版本不一致: " << scalar_rms << " vs " << avx_rms; - EXPECT_TRUE(float_equal(scalar_rms, avx512_rms, RMS_TOLERANCE)) - << "calculate_rms AVX512版本与标量版本不一致: " << scalar_rms << " vs " << avx512_rms; - - // 测试 calculate_peak 一致性 - float scalar_peak = scalar_audio_processing_func::calculate_peak(src1.data(), num_samples); - float sse_peak = x86_simd_audio_processing_func::calculate_peak_sse(src1.data(), num_samples); - float avx_peak = x86_simd_audio_processing_func::calculate_peak_avx(src1.data(), num_samples); - float avx512_peak = x86_simd_audio_processing_func::calculate_peak_avx512(src1.data(), num_samples); - - EXPECT_TRUE(float_equal(scalar_peak, sse_peak, PEAK_TOLERANCE)) - << "calculate_peak SSE版本与标量版本不一致: " << scalar_peak << " vs " << sse_peak; - EXPECT_TRUE(float_equal(scalar_peak, avx_peak, PEAK_TOLERANCE)) - << "calculate_peak AVX版本与标量版本不一致: " << scalar_peak << " vs " << 
avx_peak; - EXPECT_TRUE(float_equal(scalar_peak, avx512_peak, PEAK_TOLERANCE)) - << "calculate_peak AVX512版本与标量版本不一致: " << scalar_peak << " vs " << avx512_peak; -} - -#endif // ALICHO_PLATFORM_X86 - -#if ALICHO_PLATFORM_ARM - -// 测试ARM NEON版本与标量版的一致性 -TEST_F(AudioProcessingTest, ArmSimdConsistency) { - const size_t num_samples = 1024; - auto src1 = data_gen.generate_sine_wave(num_samples, 440.0f); - auto src2 = data_gen.generate_sine_wave(num_samples, 880.0f); - - aligned_audio_buffer scalar_result(num_samples); - aligned_audio_buffer neon_result(num_samples); - - // 测试 mix_audio 一致性 - scalar_audio_processing_func::mix_audio(src1.data(), src2.data(), scalar_result.data(), num_samples); - arm_simd_audio_processing_func::mix_audio_neon(src1.data(), src2.data(), neon_result.data(), num_samples); - - EXPECT_TRUE(arrays_equal(scalar_result.data(), neon_result.data(), num_samples)) - << "mix_audio NEON版本与标量版不一致"; - - // 测试 apply_gain 一致性 - const float gain = 0.75f; - scalar_audio_processing_func::apply_gain(src1.data(), scalar_result.data(), gain, num_samples); - arm_simd_audio_processing_func::apply_gain_neon(src1.data(), neon_result.data(), gain, num_samples); - - EXPECT_TRUE(arrays_equal(scalar_result.data(), neon_result.data(), num_samples)) - << "apply_gain NEON版本与标量版不一致"; - - // 测试 calculate_rms 一致性 - float scalar_rms = scalar_audio_processing_func::calculate_rms(src1.data(), num_samples); - float neon_rms = arm_simd_audio_processing_func::calculate_rms_neon(src1.data(), num_samples); - - EXPECT_TRUE(float_equal(scalar_rms, neon_rms, RMS_TOLERANCE)) - << "calculate_rms NEON版本与标量版不一致: " << scalar_rms << " vs " << neon_rms; - - // 测试 calculate_peak 一致性 - float scalar_peak = scalar_audio_processing_func::calculate_peak(src1.data(), num_samples); - float neon_peak = arm_simd_audio_processing_func::calculate_peak_neon(src1.data(), num_samples); - - EXPECT_TRUE(float_equal(scalar_peak, neon_peak, PEAK_TOLERANCE)) - << "calculate_peak NEON版本与标量版不一致: " << scalar_peak << " 
vs " << neon_peak; -} - -#endif // ALICHO_PLATFORM_ARM - -/** - * ======================================== - * 性能测试 - * ======================================== - */ - -// 性能测试运行为 TEST_F 测试,但不使用 EXPECT/ASSERT -TEST_F(AudioProcessingTest, PerformanceTests) { - std::cout << "\n=== 性能测试 ===" << std::endl; - - // 生成大量测试数据 - auto src1 = data_gen.generate_sine_wave(PERF_TEST_SIZE, 440.0f); - auto src2 = data_gen.generate_sine_wave(PERF_TEST_SIZE, 880.0f); - aligned_audio_buffer result(PERF_TEST_SIZE); - - // 性能测试:mix_audio - { - double scalar_time = perf_tester.measure_execution_time([&]() { - scalar_audio_processing_func::mix_audio(src1.data(), src2.data(), result.data(), PERF_TEST_SIZE); - }); - - #if ALICHO_PLATFORM_X86 - double sse_time = perf_tester.measure_execution_time([&]() { - x86_simd_audio_processing_func::mix_audio_sse(src1.data(), src2.data(), result.data(), PERF_TEST_SIZE); - }); - perf_tester.print_performance_comparison("mix_audio (SSE vs Scalar)", scalar_time, sse_time); - - double avx_time = perf_tester.measure_execution_time([&]() { - x86_simd_audio_processing_func::mix_audio_avx(src1.data(), src2.data(), result.data(), PERF_TEST_SIZE); - }); - perf_tester.print_performance_comparison("mix_audio (AVX vs Scalar)", scalar_time, avx_time); - - double avx512_time = perf_tester.measure_execution_time([&]() { - x86_simd_audio_processing_func::mix_audio_avx512(src1.data(), src2.data(), result.data(), PERF_TEST_SIZE); - }); - perf_tester.print_performance_comparison("mix_audio (AVX512 vs Scalar)", scalar_time, avx512_time); - #endif - - #if ALICHO_PLATFORM_ARM - double neon_time = perf_tester.measure_execution_time([&]() { - arm_simd_audio_processing_func::mix_audio_neon(src1.data(), src2.data(), result.data(), PERF_TEST_SIZE); - }); - perf_tester.print_performance_comparison("mix_audio (NEON vs Scalar)", scalar_time, neon_time); - #endif - } - - // 性能测试:apply_gain - { - const float gain = 0.5f; - double scalar_time = perf_tester.measure_execution_time([&]() 
{ - scalar_audio_processing_func::apply_gain(src1.data(), result.data(), gain, PERF_TEST_SIZE); - }); - - #if ALICHO_PLATFORM_X86 - double sse_time = perf_tester.measure_execution_time([&]() { - x86_simd_audio_processing_func::apply_gain_sse(src1.data(), result.data(), gain, PERF_TEST_SIZE); - }); - perf_tester.print_performance_comparison("apply_gain (SSE vs Scalar)", scalar_time, sse_time); - - double avx_time = perf_tester.measure_execution_time([&]() { - x86_simd_audio_processing_func::apply_gain_avx(src1.data(), result.data(), gain, PERF_TEST_SIZE); - }); - perf_tester.print_performance_comparison("apply_gain (AVX vs Scalar)", scalar_time, avx_time); - - double avx512_time = perf_tester.measure_execution_time([&]() { - x86_simd_audio_processing_func::apply_gain_avx512(src1.data(), result.data(), gain, PERF_TEST_SIZE); - }); - perf_tester.print_performance_comparison("apply_gain (AVX512 vs Scalar)", scalar_time, avx512_time); - #endif - - #if ALICHO_PLATFORM_ARM - double neon_time = perf_tester.measure_execution_time([&]() { - arm_simd_audio_processing_func::apply_gain_neon(src1.data(), result.data(), gain, PERF_TEST_SIZE); - }); - perf_tester.print_performance_comparison("apply_gain (NEON vs Scalar)", scalar_time, neon_time); - #endif - } - - // 性能测试:calculate_rms - { - double scalar_time = perf_tester.measure_execution_time([&]() { - scalar_audio_processing_func::calculate_rms(src1.data(), PERF_TEST_SIZE); - }); - - #if ALICHO_PLATFORM_X86 - double sse_time = perf_tester.measure_execution_time([&]() { - x86_simd_audio_processing_func::calculate_rms_sse(src1.data(), PERF_TEST_SIZE); - }); - perf_tester.print_performance_comparison("calculate_rms (SSE vs Scalar)", scalar_time, sse_time); - - double avx_time = perf_tester.measure_execution_time([&]() { - x86_simd_audio_processing_func::calculate_rms_avx(src1.data(), PERF_TEST_SIZE); - }); - perf_tester.print_performance_comparison("calculate_rms (AVX vs Scalar)", scalar_time, avx_time); - - double avx512_time = 
perf_tester.measure_execution_time([&]() { - x86_simd_audio_processing_func::calculate_rms_avx512(src1.data(), PERF_TEST_SIZE); - }); - perf_tester.print_performance_comparison("calculate_rms (AVX512 vs Scalar)", scalar_time, avx512_time); - #endif - - #if ALICHO_PLATFORM_ARM - double neon_time = perf_tester.measure_execution_time([&]() { - arm_simd_audio_processing_func::calculate_rms_neon(src1.data(), PERF_TEST_SIZE); - }); - perf_tester.print_performance_comparison("calculate_rms (NEON vs Scalar)", scalar_time, neon_time); - #endif - } - - // 性能测试:calculate_peak - { - double scalar_time = perf_tester.measure_execution_time([&]() { - scalar_audio_processing_func::calculate_peak(src1.data(), PERF_TEST_SIZE); - }); - - #if ALICHO_PLATFORM_X86 - double sse_time = perf_tester.measure_execution_time([&]() { - x86_simd_audio_processing_func::calculate_peak_sse(src1.data(), PERF_TEST_SIZE); - }); - perf_tester.print_performance_comparison("calculate_peak (SSE vs Scalar)", scalar_time, sse_time); - - double avx_time = perf_tester.measure_execution_time([&]() { - x86_simd_audio_processing_func::calculate_peak_avx(src1.data(), PERF_TEST_SIZE); - }); - perf_tester.print_performance_comparison("calculate_peak (AVX vs Scalar)", scalar_time, avx_time); - - double avx512_time = perf_tester.measure_execution_time([&]() { - x86_simd_audio_processing_func::calculate_peak_avx512(src1.data(), PERF_TEST_SIZE); - }); - perf_tester.print_performance_comparison("calculate_peak (AVX512 vs Scalar)", scalar_time, avx512_time); - #endif - - #if ALICHO_PLATFORM_ARM - double neon_time = perf_tester.measure_execution_time([&]() { - arm_simd_audio_processing_func::calculate_peak_neon(src1.data(), PERF_TEST_SIZE); - }); - perf_tester.print_performance_comparison("calculate_peak (NEON vs Scalar)", scalar_time, neon_time); - #endif - } -} +} // namespace diff --git a/tests/simd/test_simd_basic.cpp b/tests/simd/test_simd_basic.cpp index 75fdfba..33bb99f 100644 --- a/tests/simd/test_simd_basic.cpp 
+++ b/tests/simd/test_simd_basic.cpp @@ -1,1273 +1,179 @@ #include -#include +#include "simd_func_dispatcher.h" +#include "simd_interface.h" +#include "lib_handle.h" +#include "cpu_features.h" +#include "aligned_allocator.h" +#include "simd_api.h" #include -#include -#include -#include -#include -#include #include -#include "cpu_features.h" -#include "simd_func_dispatcher.h" -#include "aligned_allocator.h" +// 定义测试用的函数指针类型 +using simd_func_ptr = void (*)(const float*, const float*, float*, size_t); -// ============================================================================= -// 测试辅助函数和宏定义 -// ============================================================================= +/** + * @brief 测试 SimdFuncDispatcher 是否能根据CPU特性自动加载动态库 + * + * 这个测试通过模拟 SimdFuncDispatcher 的核心加载逻辑来验证其功能。 + * + * 测试步骤: + * 1. 检测当前CPU支持的最佳SIMD级别。 + * 2. 根据SIMD级别和操作系统平台,构造出预期的动态库文件名。 + * (这模拟了 SimdFuncDispatcher 构造函数中的逻辑) + * 3. 使用 lib_handle 手动加载这个动态库。 + * 4. 从加载的库中获取 "add" 和 "subtract" 函数的指针。 + * 5. 验证获取到的函数指针是否有效(非空)。 + * 6. 
准备测试数据并调用函数指针,验证其功能正确性。 + * + * 这个测试间接验证了: + * - SimdFuncDispatcher 的库选择逻辑是正确的。 + * - 对应于当前CPU的SIMD动态库是存在的并且可以被加载。 + * - 库中导出了正确的 "add" 和 "subtract" 函数。 + */ +TEST(SimdFuncDispatcherTest, ShouldLoadCorrectLibraryBasedOnCpuFeatures) { + const auto& detector = cpu_feature_detector::instance(); + std::string lib_name; -// 跨平台兼容性宏 -#ifndef ALICHO_PLATFORM_WINDOWS -#define ALICHO_PLATFORM_WINDOWS 0 -#endif + // 模拟 SimdFuncDispatcher 的库选择逻辑 + if (detector.supports(cpu_feature::AVX512F)) { + #if ALICHO_PLATFORM_WINDOWS + lib_name = "alicho_simd_avx512.dll"; + #elif ALICHO_PLATFORM_LINUX + lib_name = "./libalicho_simd_avx512.so"; + #elif ALICHO_PLATFORM_APPLE + lib_name = "./libalicho_simd_avx512.dylib"; + #endif + } else if (detector.supports(cpu_feature::AVX)) { + #if ALICHO_PLATFORM_WINDOWS + lib_name = "alicho_simd_avx.dll"; + #elif ALICHO_PLATFORM_LINUX + lib_name = "./libalicho_simd_avx.so"; + #elif ALICHO_PLATFORM_APPLE + lib_name = "./libalicho_simd_avx.dylib"; + #endif + } else if (detector.supports(cpu_feature::SSE)) { + #if ALICHO_PLATFORM_WINDOWS + lib_name = "alicho_simd_sse.dll"; + #elif ALICHO_PLATFORM_LINUX + lib_name = "./libalicho_simd_sse.so"; + #elif ALICHO_PLATFORM_APPLE + lib_name = "./libalicho_simd_sse.dylib"; + #endif + } else { + #if ALICHO_PLATFORM_WINDOWS + lib_name = "alicho_simd_scaler.dll"; + #elif ALICHO_PLATFORM_LINUX + lib_name = "./libalicho_simd_scaler.so"; + #elif ALICHO_PLATFORM_APPLE + lib_name = "./libalicho_simd_scaler.dylib"; + #endif + } -#ifndef ALICHO_PLATFORM_X86 -#define ALICHO_PLATFORM_X86 1 -#endif + ASSERT_FALSE(lib_name.empty()) << "Could not determine the SIMD library name for the current CPU."; -#ifndef ALICHO_PLATFORM_ARM -#define ALICHO_PLATFORM_ARM 0 -#endif + lib_handle handle; + ASSERT_TRUE(handle.open(lib_name)) << "Failed to open SIMD library: " << lib_name; -#ifndef ALICHO_PLATFORM_POSIX -#define ALICHO_PLATFORM_POSIX 0 -#endif + auto fill_func = get_function_by_func_signature(handle, fill_buffer); + auto 
mix_func = get_function_by_func_signature(handle, mix_audio); -#ifndef ALICHO_PLATFORM_UNIX -#define ALICHO_PLATFORM_UNIX 0 -#endif + ASSERT_NE(fill_func, nullptr) << "Failed to load 'fill_buffer' function from " << lib_name; + ASSERT_NE(mix_func, nullptr) << "Failed to load 'mix_audio' function from " << lib_name; -// 测试辅助函数 -namespace simd_test_helpers { - // 简单的性能计时器 - class timer { - public: - timer() : start_(std::chrono::high_resolution_clock::now()) { - } + // 准备测试数据 + constexpr size_t num_samples = 1024; + std::vector src1(num_samples); + std::vector src2(num_samples); + std::vector dst_fill(num_samples, 0.0f); + std::vector dst_mix(num_samples, 0.0f); - auto elapsed_ms() const -> double { - auto end = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(end - start_); - return duration.count() / 1000.0; - } + std::iota(src1.begin(), src1.end(), 0.0f); + std::iota(src2.begin(), src2.end(), static_cast(num_samples)); - private: - std::chrono::high_resolution_clock::time_point start_; - }; + // 调用加载的函数 + fill_func(dst_fill.data(), 1.0f, num_samples); // 初始化为0 + mix_func(src1.data(), src2.data(), dst_mix.data(), num_samples); // 执行加法 - // 测试用的简单数学函数 - auto add_scalar(float a, float b) -> float { return a + b; } - auto add_sse(float a, float b) -> float { return a + b + 0.1f; } // 模拟SSE版本 - auto add_avx(float a, float b) -> float { return a + b + 0.2f; } // 模拟AVX版本 - - // 测试用的数组求和函数 - auto sum_array_scalar(const std::vector& arr) -> float { - float sum = 0.0f; - for (const auto& val : arr) { - sum += val; - } - return sum; - } - - auto sum_array_sse(const std::vector& arr) -> float { - // 模拟SSE实现 - return sum_array_scalar(arr) * 1.01f; - } - - auto sum_array_avx(const std::vector& arr) -> float { - // 模拟AVX实现 - return sum_array_scalar(arr) * 1.02f; - } - - // 检查指针是否正确对齐 - template - auto is_properly_aligned(void* ptr) -> bool { - return (reinterpret_cast(ptr) % alignment) == 0; - } - - // 生成测试数据 - auto 
generate_test_data(size_t size) -> std::vector { - std::vector data; - data.reserve(size); - for (size_t i = 0; i < size; ++i) { - data.push_back(static_cast(i) * 0.1f); - } - return data; - } + // 验证 fill_buffer 函数 + for (size_t i = 0; i < num_samples; ++i) { + ASSERT_EQ(dst_fill[i], 1.0f) << "fill_buffer function did not work correctly at index " << i; + } + // 验证 mix_audio 函数 + for (size_t i = 0; i < num_samples; ++i) { + ASSERT_EQ(dst_mix[i], src1[i] + src2[i]) << "mix_audio function did not work correctly at index " << i; + } } -// ============================================================================= -// 主测试类 -// ============================================================================= -class simd_test : public ::testing::Test { -protected: - void SetUp() override { - // 获取CPU信息用于后续测试 - cpu_info_ = &get_cpu_info(); - } +// ==================================================================== +// 测试 simd_func_dispatcher 和 simd_api 的功能 +// ==================================================================== - void TearDown() override { - // 清理测试环境 - } - - const cpu_info* cpu_info_ = nullptr; -}; - -// ============================================================================= -// CPU特性检测模块测试(9个测试用例) -// ============================================================================= - -// 基础功能测试 -TEST_F(simd_test, CpuFeaturesTest_BasicDetection) { - ASSERT_NE(cpu_info_, nullptr); - - // 基本信息应该已填充 - EXPECT_FALSE(cpu_info_->vendor.empty()); - EXPECT_FALSE(cpu_info_->brand.empty()); - EXPECT_GT(cpu_info_->logical_cores, 0); - EXPECT_GT(cpu_info_->physical_cores, 0); - - // 特性字符串应该可以生成 - auto features_str = cpu_info_->features_string(); - EXPECT_TRUE(features_str.empty() || !features_str.empty()); // 总是为真,但测试调用成功 - - std::cout << "CPU 厂商: " << cpu_info_->vendor << std::endl; - std::cout << "CPU 型号: " << cpu_info_->brand << std::endl; - std::cout << "逻辑核心数: " << cpu_info_->logical_cores << std::endl; - std::cout << "物理核心数: " << cpu_info_->physical_cores << 
std::endl; - std::cout << "特性: " << features_str << std::endl; +TEST(SimdDispatcher, AutoVersionSelection) { + // 测试自动版本选择 + auto& dispatcher = simd_func_dispatcher::instance(); + auto version = dispatcher.get_active_version(); + + // 版本应该不是 COUNT(无效值) + EXPECT_NE(version, simd_func_version::COUNT); + + // 所有函数指针都应该非空 + EXPECT_NE(dispatcher.get_fill_buffer(), nullptr); + EXPECT_NE(dispatcher.get_mix_audio(), nullptr); + EXPECT_NE(dispatcher.get_apply_gain(), nullptr); + EXPECT_NE(dispatcher.get_calculate_rms(), nullptr); + EXPECT_NE(dispatcher.get_calculate_peak(), nullptr); + + // 打印当前使用的版本 + auto version_str = simd::get_active_simd_version_string(); + std::cout << "当前SIMD版本: " << version_str << std::endl; } -TEST_F(simd_test, CpuFeaturesTest_SimdLevelDetection) { - auto max_level = get_max_simd_level(); - auto recommended_level = get_recommended_simd_level(); - - // SIMD级别应该在有效范围内 - EXPECT_GE(static_cast(max_level), static_cast(simd_level::NONE)); - EXPECT_LE(static_cast(max_level), static_cast(simd_level::NEON_FP16)); - - EXPECT_GE(static_cast(recommended_level), static_cast(simd_level::NONE)); - EXPECT_LE(static_cast(recommended_level), static_cast(simd_level::NEON_FP16)); - - // 推荐级别不应该超过最大级别 - EXPECT_LE(static_cast(recommended_level), static_cast(max_level)); - - std::cout << "最大 SIMD 级别: " << static_cast(max_level) << std::endl; - std::cout << "推荐 SIMD 级别: " << static_cast(recommended_level) << std::endl; +TEST(SimdAPI, FillBuffer) { + // 测试 fill_buffer API - 使用对齐的分配器 + std::vector buffer(1024, 0.0f); + + simd::fill_buffer(buffer.data(), 1.0f, buffer.size()); + + // 验证所有元素都被填充为 1.0 + for (size_t i = 0; i < buffer.size(); ++i) { + EXPECT_FLOAT_EQ(buffer[i], 1.0f) << "索引 " << i << " 的值不正确"; + } } -TEST_F(simd_test, CpuFeaturesTest_GlobalFunctions) { - // 测试全局便利函数 - const auto& info = get_cpu_info(); - EXPECT_EQ(&info, cpu_info_); - - // 测试特性检查函数 - auto sse_supported = cpu_supports(cpu_feature::SSE); - auto sse2_supported = cpu_supports(cpu_feature::SSE2); - - 
// 如果支持SSE2,应该也支持SSE - if (sse2_supported) { - EXPECT_TRUE(sse_supported); - } - - // 测试级别检查 - auto detector = &cpu_feature_detector::instance(); - EXPECT_EQ(detector->max_simd_level(), info.max_simd_level); - - // 验证支持级别检查逻辑 - EXPECT_TRUE(detector->supports_at_least(simd_level::NONE)); - - if (info.max_simd_level >= simd_level::SSE) { - EXPECT_TRUE(detector->supports_at_least(simd_level::SSE)); - } +TEST(SimdAPI, CalculateRMS) { + // 测试 calculate_rms API - 使用对齐的分配器 + std::vector buffer(1024, 1.0f); + + float rms = simd::calculate_rms(buffer.data(), buffer.size()); + + // 所有值为1.0的RMS应该是1.0 + EXPECT_NEAR(rms, 1.0f, 0.001f); } -// 平台兼容性测试 -TEST_F(simd_test, CpuFeaturesTest_X86PlatformSupport) { - #if ALICHO_PLATFORM_X86 - // 在x86平台上,至少应该支持SSE - EXPECT_TRUE(cpu_supports(cpu_feature::SSE) || cpu_supports(cpu_feature::SSE2)); - - // 检查常见的x86特性 - std::vector x86_features = { - cpu_feature::SSE, cpu_feature::SSE2, cpu_feature::SSE3, - cpu_feature::AVX, cpu_feature::AVX2, cpu_feature::FMA - }; - - bool has_any_x86_feature = false; - for (auto feature : x86_features) { - if (cpu_supports(feature)) { - has_any_x86_feature = true; - break; - } - } - EXPECT_TRUE(has_any_x86_feature); - #else - GTEST_SKIP() << "Not x86 platform"; - #endif +TEST(SimdAPI, MixAudio) { + // 测试 mix_audio API - 使用对齐的分配器 + std::vector src1(1024, 1.0f); + std::vector src2(1024, 2.0f); + std::vector dst(1024, 0.0f); + + simd::mix_audio(src1.data(), src2.data(), dst.data(), dst.size()); + + // 验证混音结果 (1.0 + 2.0 = 3.0) + for (size_t i = 0; i < dst.size(); ++i) { + EXPECT_FLOAT_EQ(dst[i], 3.0f) << "索引 " << i << " 的混音结果不正确"; + } } -TEST_F(simd_test, CpuFeaturesTest_ArmPlatformSupport) { - #if ALICHO_PLATFORM_ARM - // 在ARM平台上,可能支持NEON - bool has_neon = cpu_supports(cpu_feature::NEON); - bool has_neon_fp16 = cpu_supports(cpu_feature::NEON_FP16); - - // 如果支持FP16,应该也支持基础NEON - if (has_neon_fp16) { - EXPECT_TRUE(has_neon); - } - - // 检查SIMD级别 - auto max_level = get_max_simd_level(); - if (has_neon) { - 
EXPECT_GE(static_cast(max_level), static_cast(simd_level::NEON)); - } - #else - GTEST_SKIP() << "Not ARM platform"; - #endif +TEST(SimdAPI, ApplyGain) { + // 测试 apply_gain API - 使用对齐的分配器 + std::vector src(1024, 2.0f); + std::vector dst(1024, 0.0f); + + simd::apply_gain(src.data(), dst.data(), 0.5f, dst.size()); + + // 验证增益应用 (2.0 * 0.5 = 1.0) + for (size_t i = 0; i < dst.size(); ++i) { + EXPECT_FLOAT_EQ(dst[i], 1.0f) << "索引 " << i << " 的增益结果不正确"; + } } - -TEST_F(simd_test, CpuFeaturesTest_CrossPlatformConsistency) { - // 跨平台一致性检查 - auto detector = &cpu_feature_detector::instance(); - - // 单例应该总是返回相同的实例 - EXPECT_EQ(detector, &cpu_feature_detector::instance()); - - // 多次调用应该返回相同的结果 - auto level1 = get_max_simd_level(); - auto level2 = get_max_simd_level(); - EXPECT_EQ(level1, level2); - - auto recommended1 = get_recommended_simd_level(); - auto recommended2 = get_recommended_simd_level(); - EXPECT_EQ(recommended1, recommended2); - - // 特性检测应该一致 - auto sse_check1 = cpu_supports(cpu_feature::SSE); - auto sse_check2 = cpu_supports(cpu_feature::SSE); - EXPECT_EQ(sse_check1, sse_check2); -} - -// SIMD级别推荐测试 -TEST_F(simd_test, CpuFeaturesTest_SimdLevelRecommendation) { - auto max_level = get_max_simd_level(); - auto recommended_level = get_recommended_simd_level(); - - // 推荐算法的合理性检查 - switch (max_level) { - case simd_level::NONE: - EXPECT_EQ(recommended_level, simd_level::NONE); - break; - case simd_level::SSE: - case simd_level::SSE3: - case simd_level::SSE4: - case simd_level::AVX: - case simd_level::AVX2: - // 对于这些级别,推荐级别应该等于最大级别 - EXPECT_EQ(recommended_level, max_level); - break; - case simd_level::AVX512: - // AVX512可能会回退到AVX2以确保兼容性 - EXPECT_TRUE(recommended_level == simd_level::AVX512 || - recommended_level == simd_level::AVX2); - break; - case simd_level::NEON: - case simd_level::NEON_FP16: - EXPECT_EQ(recommended_level, max_level); - break; - } -} - -TEST_F(simd_test, CpuFeaturesTest_PerformanceGuidedSelection) { - // 测试性能引导的SIMD级别选择 - auto recommended = 
get_recommended_simd_level(); - auto max_level = get_max_simd_level(); - - // 推荐级别应该考虑性能和兼容性 - EXPECT_LE(static_cast(recommended), static_cast(max_level)); - - // 在AVX512的情况下,验证特殊逻辑 - if (max_level == simd_level::AVX512) { - bool has_avx512f = cpu_supports(cpu_feature::AVX512F); - bool has_avx512vl = cpu_supports(cpu_feature::AVX512VL); - bool has_avx512bw = cpu_supports(cpu_feature::AVX512BW); - - if (has_avx512f && has_avx512vl && has_avx512bw) { - // 应该根据CPU供应商和型号决定 - if (cpu_info_->vendor.find("AMD") != std::string::npos) { - EXPECT_EQ(recommended, simd_level::AVX512); - } - // Intel的情况下可能会有特殊处理 - } - } -} - -// 异常处理测试 -TEST_F(simd_test, CpuFeaturesTest_InvalidFeatureHandling) { - // 测试无效特性值的处理 - // 由于cpu_feature是enum class,编译器会阻止大多数无效值 - - // 测试边界值 - 使用一个明确未定义的特性值 - auto invalid_feature = static_cast(0); // 0值通常不代表任何特性 - EXPECT_NO_THROW({ - bool result = cpu_supports(invalid_feature); - // 0值应该返回false - EXPECT_FALSE(result); - }); - - // 测试特性位掩码的正确性 - uint32_t all_features = cpu_info_->features; - for (int bit = 0; bit < 32; ++bit) { - auto feature = static_cast(1U << bit); - bool expected = (all_features & (1U << bit)) != 0; - bool actual = cpu_supports(feature); - EXPECT_EQ(expected, actual) << "Bit " << bit << " mismatch"; - } -} - -TEST_F(simd_test, CpuFeaturesTest_ThreadSafety) { - // 测试多线程安全性 - const int num_threads = 4; - const int calls_per_thread = 100; - - std::vector threads; - std::vector results(num_threads * calls_per_thread); - - // 启动多个线程同时访问CPU特性检测 - for (int t = 0; t < num_threads; ++t) { - threads.emplace_back([&, t]() { - for (int i = 0; i < calls_per_thread; ++i) { - int idx = t * calls_per_thread + i; - - // 测试不同的API调用 - switch (i % 4) { - case 0: - results[idx] = cpu_supports(cpu_feature::SSE); - break; - case 1: - results[idx] = (get_max_simd_level() != simd_level::NONE); - break; - case 2: - results[idx] = (get_recommended_simd_level() != simd_level::NONE); - break; - case 3: - results[idx] = !get_cpu_info().vendor.empty(); - break; - 
} - } - }); - } - - // 等待所有线程完成 - for (auto& thread : threads) { - thread.join(); - } - - // 验证同一类型的调用返回相同结果 - bool sse_result = cpu_supports(cpu_feature::SSE); - auto max_level = get_max_simd_level(); - auto recommended_level = get_recommended_simd_level(); - bool has_vendor = !get_cpu_info().vendor.empty(); - - for (int i = 0; i < calls_per_thread; ++i) { - for (int t = 0; t < num_threads; ++t) { - int idx = t * calls_per_thread + i; - switch (i % 4) { - case 0: - EXPECT_EQ(results[idx], sse_result); - break; - case 1: - EXPECT_EQ(results[idx], (max_level != simd_level::NONE)); - break; - case 2: - EXPECT_EQ(results[idx], (recommended_level != simd_level::NONE)); - break; - case 3: - EXPECT_EQ(results[idx], has_vendor); - break; - } - } - } -} - -// ============================================================================= -// SIMD函数分发器模块测试(8个测试用例) -// ============================================================================= - -// 函数注册和查找 -TEST_F(simd_test, SimdDispatcherTest_FunctionRegistration) { - auto& dispatcher = simd_func_dispatcher::instance(); - - // 注册测试函数 - std::function scalar_add = simd_test_helpers::add_scalar; - std::function sse_add = simd_test_helpers::add_sse; - std::function avx_add = simd_test_helpers::add_avx; - - EXPECT_NO_THROW({ - dispatcher.register_function("test_add", simd_func_version::SCALAR, scalar_add); - dispatcher.register_function("test_add", simd_func_version::SSE, sse_add); - dispatcher.register_function("test_add", simd_func_version::AVX, avx_add); - }); - - // 验证函数已注册 - auto func_list = dispatcher.list_functions(); - EXPECT_TRUE(std::find(func_list.begin(), func_list.end(), "test_add") != func_list.end()); -} - -TEST_F(simd_test, SimdDispatcherTest_FunctionLookup) { - auto& dispatcher = simd_func_dispatcher::instance(); - - // 查找已注册的函数 - EXPECT_NO_THROW({ - const auto& func = dispatcher.get_function("test_add"); - - // 函数应该可以调用 - float result = func(1.0f, 2.0f); - EXPECT_GT(result, 0.0f); // 结果应该是正数 - }); - - // 
查找不存在的函数应该抛出异常 - EXPECT_THROW({ - const auto& nonexistent = dispatcher.get_function("nonexistent_func"); - }, std::runtime_error); -} - -TEST_F(simd_test, SimdDispatcherTest_MultiVersionManagement) { - auto& dispatcher = simd_func_dispatcher::instance(); - - // 创建一个新的测试函数 - const std::string func_name = "multi_version_test"; - - // 注册多个版本 - dispatcher.register_function&)>( - func_name, simd_func_version::SCALAR, simd_test_helpers::sum_array_scalar); - dispatcher.register_function&)>( - func_name, simd_func_version::SSE, simd_test_helpers::sum_array_sse); - dispatcher.register_function&)>( - func_name, simd_func_version::AVX, simd_test_helpers::sum_array_avx); - - // 获取函数并测试 - const auto& func = dispatcher.get_function&)>(func_name); - - auto test_data = simd_test_helpers::generate_test_data(100); - float result = func(test_data); - - // 结果应该大于纯标量计算的结果(因为模拟的SIMD版本会增加系数) - float scalar_result = simd_test_helpers::sum_array_scalar(test_data); - EXPECT_GE(result, scalar_result); - - std::cout << "多版本结果: " << result << " (标量: " << scalar_result << ")" << std::endl; -} - -// 自动分发机制 -TEST_F(simd_test, SimdDispatcherTest_AutomaticDispatch) { - auto& dispatcher = simd_func_dispatcher::instance(); - - // 测试自动分发是否选择最佳版本 - const std::string func_name = "auto_dispatch_test"; - - // 只注册标量版本 - dispatcher.register_function( - func_name, simd_func_version::SCALAR, - [](int a, int b) { return a + b; }); - - // 根据当前系统支持,可能还会注册其他版本 - if (cpu_supports(cpu_feature::SSE)) { - dispatcher.register_function( - func_name, simd_func_version::SSE, - [](int a, int b) { return a + b + 1; }); // SSE版本加1标识 - } - - if (cpu_supports(cpu_feature::AVX)) { - dispatcher.register_function( - func_name, simd_func_version::AVX, - [](int a, int b) { return a + b + 2; }); // AVX版本加2标识 - } - - // 测试分发选择 - const auto& func = dispatcher.get_function(func_name); - int result = func(10, 20); - - // 验证选择了正确的版本 - if (cpu_supports(cpu_feature::AVX)) { - EXPECT_EQ(result, 32); // 10 + 20 + 2 - } - else if 
(cpu_supports(cpu_feature::SSE)) { - EXPECT_EQ(result, 31); // 10 + 20 + 1 - } - else { - EXPECT_EQ(result, 30); // 10 + 20 - } -} - -TEST_F(simd_test, SimdDispatcherTest_PriorityBasedSelection) { - // 测试基于优先级的版本选择 - auto recommended_level = get_recommended_simd_level(); - auto expected_version = simd_level_to_version(recommended_level); - - std::cout << "推荐 SIMD 级别: " << static_cast(recommended_level) << std::endl; - std::cout << "期望版本: " << static_cast(expected_version) << std::endl; - - // 验证级别转换函数 - EXPECT_GE(static_cast(expected_version), static_cast(simd_func_version::SCALAR)); - EXPECT_LE(static_cast(expected_version), static_cast(simd_func_version::VECTOR)); - - // 测试转换一致性 - switch (recommended_level) { - case simd_level::NONE: - EXPECT_EQ(expected_version, simd_func_version::SCALAR); - break; - case simd_level::SSE: - EXPECT_EQ(expected_version, simd_func_version::SSE); - break; - case simd_level::AVX: - EXPECT_EQ(expected_version, simd_func_version::AVX); - break; - case simd_level::AVX2: - EXPECT_EQ(expected_version, simd_func_version::AVX2); - break; - default: - // 其他情况也应该有对应的版本 - break; - } -} - -TEST_F(simd_test, SimdDispatcherTest_VersionFallback) { - auto& dispatcher = simd_func_dispatcher::instance(); - const std::string func_name = "fallback_test"; - - // 只注册标量版本,测试回退机制 - dispatcher.register_function( - func_name, simd_func_version::SCALAR, - [](double x) { return x * 2.0; }); - - // 即使系统支持更高级的SIMD,也应该回退到标量版本 - const auto& func = dispatcher.get_function(func_name); - double result = func(3.14); - EXPECT_DOUBLE_EQ(result, 6.28); - - // 现在注册一个高级版本 - if (cpu_supports(cpu_feature::AVX)) { - dispatcher.register_function( - func_name, simd_func_version::AVX, - [](double x) { return x * 3.0; }); // 不同的计算以验证选择了正确版本 - - // 重新获取函数,应该选择AVX版本 - const auto& avx_func = dispatcher.get_function(func_name); - double avx_result = avx_func(3.14); - EXPECT_DOUBLE_EQ(avx_result, 9.42); - } -} - -// 宏接口测试 -TEST_F(simd_test, SimdDispatcherTest_MacroInterface) { - // 
测试注册宏 - EXPECT_NO_THROW({ - std::function square_func = [](int x) { return x * x; }; - REGISTER_SIMD_FUNCTION("macro_test", simd_func_version::SCALAR, square_func); - }); - - // 测试获取宏 - EXPECT_NO_THROW({ - const auto& func = GET_SIMD_FUNCTION(int(int), "macro_test"); - int result = func(5); - EXPECT_EQ(result, 25); - }); - - // 测试调用宏 - EXPECT_NO_THROW({ - int result = CALL_SIMD_FUNCTION(int(int), "macro_test", 6); - EXPECT_EQ(result, 36); - }); - - // 测试字符串转换函数 - EXPECT_STREQ(simd_func_version_to_string(simd_func_version::SCALAR), "SCALAR"); - EXPECT_STREQ(simd_func_version_to_string(simd_func_version::SSE), "SSE"); - EXPECT_STREQ(simd_func_version_to_string(simd_func_version::AVX), "AVX"); - - EXPECT_EQ(string_to_simd_func_version("SCALAR"), simd_func_version::SCALAR); - EXPECT_EQ(string_to_simd_func_version("SSE"), simd_func_version::SSE); - EXPECT_EQ(string_to_simd_func_version("AVX"), simd_func_version::AVX); - EXPECT_EQ(string_to_simd_func_version("INVALID"), simd_func_version::SCALAR); // 默认回退 -} - -TEST_F(simd_test, SimdDispatcherTest_TypeSafety) { - auto& dispatcher = simd_func_dispatcher::instance(); - - // 注册不同类型的函数 - dispatcher.register_function("int_func", simd_func_version::SCALAR, - [](int x) { return x + 1; }); - dispatcher.register_function("float_func", simd_func_version::SCALAR, - [](float x) { return x + 1.0f; }); - - // 类型安全检查 - EXPECT_NO_THROW({ - const auto& int_func = dispatcher.get_function("int_func"); - int result = int_func(42); - EXPECT_EQ(result, 43); - }); - - EXPECT_NO_THROW({ - const auto& float_func = dispatcher.get_function("float_func"); - float result = float_func(3.14f); - EXPECT_FLOAT_EQ(result, 4.14f); - }); - - // 尝试用不同的类型获取同名函数会创建独立的函数持有者 - EXPECT_NO_THROW({ - // 这会创建一个新的double类型函数持有者,与int类型的是分离的 - const auto& double_func = dispatcher.get_function("int_func"); - // 这验证了类型安全性 - 不同类型的函数是分离的 - }); -} - -// 错误处理 -TEST_F(simd_test, SimdDispatcherTest_InvalidRegistration) { - auto& dispatcher = simd_func_dispatcher::instance(); - 
- // 测试重复注册相同版本 - EXPECT_NO_THROW({ - dispatcher.register_function("duplicate_test", simd_func_version::SCALAR, - []() { return 1; }); - dispatcher.register_function("duplicate_test", simd_func_version::SCALAR, - []() { return 2; }); // 覆盖前一个 - }); - - // 验证最后注册的版本生效 - const auto& func = dispatcher.get_function("duplicate_test"); - int result = func(); - EXPECT_EQ(result, 2); -} - -TEST_F(simd_test, SimdDispatcherTest_MissingFunction) { - auto& dispatcher = simd_func_dispatcher::instance(); - - // 尝试获取未注册的函数应该抛出异常 - EXPECT_THROW({ - const auto& missing_func = dispatcher.get_function("nonexistent_function"); - }, std::runtime_error); - - // 尝试调用未注册的函数 - EXPECT_THROW({ - CALL_SIMD_FUNCTION(void(), "another_nonexistent_function"); - }, std::runtime_error); -} - -// ============================================================================= -// 对齐内存分配器模块测试(9个测试用例) -// ============================================================================= - -// 基础分配测试 -TEST_F(simd_test, AlignedAllocatorTest_BasicAllocation) { - // 测试基本的对齐分配 - constexpr size_t alignment = ALIGNMENT_AVX; // 32字节对齐 - constexpr size_t size = 1024; - - void* ptr = aligned_malloc(size, alignment); - ASSERT_NE(ptr, nullptr); - EXPECT_TRUE(simd_test_helpers::is_properly_aligned(ptr)); - - // 写入数据验证可用性 - auto* data = static_cast(ptr); - for (size_t i = 0; i < size; ++i) { - data[i] = static_cast(i % 256); - } - - // 验证数据 - for (size_t i = 0; i < size; ++i) { - EXPECT_EQ(data[i], static_cast(i % 256)); - } - - aligned_free(ptr); -} - -TEST_F(simd_test, AlignedAllocatorTest_VariousAlignments) { - // 测试不同的对齐要求 - std::vector alignments = { - ALIGNMENT_SSE, // 16字节 - ALIGNMENT_AVX, // 32字节 - ALIGNMENT_AVX512, // 64字节 - ALIGNMENT_CACHE // 64字节(缓存行) - }; - - constexpr size_t size = 256; - - for (auto alignment : alignments) { - void* ptr = aligned_malloc(size, alignment); - ASSERT_NE(ptr, nullptr) << "Failed to allocate with alignment " << alignment; - - EXPECT_TRUE(is_aligned(ptr, alignment)) - << "Pointer 
not properly aligned to " << alignment << " bytes"; - - // 验证可以写入数据 - std::memset(ptr, 0xAB, size); - - aligned_free(ptr); - } -} - -TEST_F(simd_test, AlignedAllocatorTest_LargeAllocations) { - // 测试大块内存分配 - std::vector sizes = { - 1024, // 1KB - 1024 * 64, // 64KB - 1024 * 1024 // 1MB - }; - - constexpr size_t alignment = ALIGNMENT_AVX; - - for (auto size : sizes) { - void* ptr = aligned_malloc(size, alignment); - ASSERT_NE(ptr, nullptr) << "Failed to allocate " << size << " bytes"; - - EXPECT_TRUE(simd_test_helpers::is_properly_aligned(ptr)); - - // 简单的读写测试 - auto* data = static_cast(ptr); - data[0] = 0x12345678; - data[size / sizeof(int) - 1] = 0x87654321; - - EXPECT_EQ(data[0], 0x12345678); - EXPECT_EQ(data[size/sizeof(int) - 1], 0x87654321); - - aligned_free(ptr); - } -} - -// STL兼容性 -TEST_F(simd_test, AlignedAllocatorTest_StlContainerCompat) { - // 测试STL容器兼容性(需要修复aligned_allocator中的错误) - using aligned_vector = std::vector>; - - EXPECT_NO_THROW({ - aligned_vector vec; - vec.reserve(100); - - for (int i = 0; i < 50; ++i) { - vec.push_back(static_cast(i)); - } - - EXPECT_EQ(vec.size(), 50); - EXPECT_GE(vec.capacity(), 50); - - // 验证对齐 - if (!vec.empty()) { - EXPECT_TRUE(simd_test_helpers::is_properly_aligned(vec.data())); - } - }); -} - -TEST_F(simd_test, AlignedAllocatorTest_VectorOperations) { - using sse_vector = std::vector>; - using avx_vector = std::vector>; - - // SSE对齐的vector - sse_vector sse_vec(100, 3.14); - EXPECT_EQ(sse_vec.size(), 100); - EXPECT_TRUE(simd_test_helpers::is_properly_aligned(sse_vec.data())); - - // AVX对齐的vector - avx_vector avx_vec(200, 2.71f); - EXPECT_EQ(avx_vec.size(), 200); - EXPECT_TRUE(simd_test_helpers::is_properly_aligned(avx_vec.data())); - - // 测试resize操作 - sse_vec.resize(200); - EXPECT_EQ(sse_vec.size(), 200); - if (!sse_vec.empty()) { - EXPECT_TRUE(simd_test_helpers::is_properly_aligned(sse_vec.data())); - } -} - -TEST_F(simd_test, AlignedAllocatorTest_MemoryManagement) { - using cache_vector = std::vector>; - - // 测试内存管理 
- { - cache_vector vec(1000); - std::iota(vec.begin(), vec.end(), 0); - - EXPECT_TRUE(simd_test_helpers::is_properly_aligned(vec.data())); - - // 验证数据正确性 - for (size_t i = 0; i < vec.size(); ++i) { - EXPECT_EQ(vec[i], static_cast(i)); - } - } // vector销毁,测试析构函数 - - // 测试移动语义 - cache_vector vec1(100, 42); - auto vec1_data = vec1.data(); - - cache_vector vec2 = std::move(vec1); - EXPECT_EQ(vec2.size(), 100); - EXPECT_EQ(vec2.data(), vec1_data); // 移动后数据指针应该相同 - EXPECT_TRUE(vec1.empty() || vec1.data() != vec1_data); // vec1应该被清空或数据被移走 -} - -// 跨平台行为 -TEST_F(simd_test, AlignedAllocatorTest_PlatformConsistency) { - // 测试跨平台的一致行为 - constexpr size_t alignment = 32; - constexpr size_t size = 1024; - - std::vector ptrs; - - // 分配多个内存块 - for (int i = 0; i < 10; ++i) { - void* ptr = aligned_malloc(size, alignment); - ASSERT_NE(ptr, nullptr); - EXPECT_TRUE(is_aligned(ptr, alignment)); - ptrs.push_back(ptr); - } - - // 验证所有指针都正确对齐 - for (auto ptr : ptrs) { - EXPECT_TRUE(is_aligned(ptr, alignment)); - - // 写入特定模式 - auto* data = static_cast(ptr); - for (size_t j = 0; j < size / sizeof(uint32_t); ++j) { - data[j] = static_cast(j * 0x12345678); - } - } - - // 验证数据完整性 - for (size_t i = 0; i < ptrs.size(); ++i) { - auto* data = static_cast(ptrs[i]); - for (size_t j = 0; j < size / sizeof(uint32_t); ++j) { - EXPECT_EQ(data[j], static_cast(j * 0x12345678)) - << "Data corruption at ptr " << i << ", index " << j; - } - } - - // 释放所有内存 - for (auto ptr : ptrs) { - aligned_free(ptr); - } -} - -TEST_F(simd_test, AlignedAllocatorTest_AlignmentVerification) { - // 测试对齐验证函数 - std::vector test_alignments = {1, 2, 4, 8, 16, 32, 64, 128}; - - for (auto alignment : test_alignments) { - // 测试2的幂次对齐 - if ((alignment & (alignment - 1)) == 0) { - // 是2的幂 - void* ptr = aligned_malloc(256, alignment); - ASSERT_NE(ptr, nullptr); - EXPECT_TRUE(is_aligned(ptr, alignment)); - aligned_free(ptr); - } - else { - // 非2的幂次应该返回nullptr - void* ptr = aligned_malloc(256, alignment); - EXPECT_EQ(ptr, nullptr); - } - } 
- - // 测试边界情况 - EXPECT_EQ(aligned_malloc(100, 0), nullptr); // 0对齐应该失败 - - // 测试align_size函数 - EXPECT_EQ(align_size(15, 16), 16); - EXPECT_EQ(align_size(16, 16), 16); - EXPECT_EQ(align_size(17, 16), 32); - EXPECT_EQ(align_size(31, 32), 32); - EXPECT_EQ(align_size(33, 32), 64); -} - -TEST_F(simd_test, AlignedAllocatorTest_PerformanceCharacteristics) { - // 简单的性能特征测试 - constexpr size_t num_allocations = 1000; - constexpr size_t allocation_size = 1024; - - // 测试对齐分配的性能 - simd_test_helpers::timer timer; - - std::vector aligned_ptrs; - aligned_ptrs.reserve(num_allocations); - - // 分配阶段 - for (size_t i = 0; i < num_allocations; ++i) { - void* ptr = aligned_malloc(allocation_size, ALIGNMENT_AVX); - ASSERT_NE(ptr, nullptr); - aligned_ptrs.push_back(ptr); - } - - double allocation_time = timer.elapsed_ms(); - - // 访问测试 - simd_test_helpers::timer access_timer; - uint64_t checksum = 0; - - // 记录开始时间 - auto start_time = std::chrono::high_resolution_clock::now(); - - for (auto ptr : aligned_ptrs) { - auto* data = static_cast(ptr); - checksum += data[0]; // 简单访问测试 - } - - auto end_time = std::chrono::high_resolution_clock::now(); - auto duration_ns = std::chrono::duration_cast(end_time - start_time).count(); - - double access_time = access_timer.elapsed_ms(); - - // 诊断日志 - std::cout << " [诊断] 访问循环耗时: " << duration_ns << " 纳秒" << std::endl; - std::cout << " [诊断] 计时器测量的访问时间: " << access_time << " 毫秒" << std::endl; - std::cout << " [诊断] 校验和值: " << checksum << std::endl; - std::cout << " [诊断] 分配数量: " << aligned_ptrs.size() << std::endl; - - // 释放阶段 - simd_test_helpers::timer free_timer; - - for (auto ptr : aligned_ptrs) { - aligned_free(ptr); - } - - double free_time = free_timer.elapsed_ms(); - - // 性能报告 - std::cout << "对齐分配性能:" << std::endl; - std::cout << " 分配次数: " << num_allocations << " x " << allocation_size << " 字节" << std::endl; - std::cout << " 分配时间: " << allocation_time << " 毫秒" << std::endl; - std::cout << " 访问时间: " << access_time << " 毫秒" << std::endl; - std::cout << " 
释放时间: " << free_time << " 毫秒" << std::endl; - std::cout << " 平均分配时间: " << (allocation_time / num_allocations) << " 毫秒" << std::endl; - - // 基本合理性检查 - EXPECT_GT(allocation_time, 0.0); - // 访问时间可能因为优化而接近0,特别是在release模式下 - // 改为检查访问时间 >= 0 而不是严格大于0 - EXPECT_GE(access_time, 0.0) << "Access time should be non-negative (may be 0 in optimized builds)"; - std::cout << " [注意] 访问时间为 " << access_time << " 毫秒 - 在发布模式下由于编译器优化可能为 0" << std::endl; - EXPECT_GT(free_time, 0.0); - - // 避免编译器优化掉checksum计算 - EXPECT_GE(checksum, 0); // checksum可能为0,但应该不会是负数 -} - -// ============================================================================= -// 集成和性能测试(4个测试用例) -// ============================================================================= - -// 端到端集成测试 -TEST_F(simd_test, SimdIntegrationTest_FullWorkflow) { - // 完整的SIMD工作流程测试:检测 -> 分发 -> 分配 -> 执行 - - // 1. CPU特性检测 - auto max_level = get_max_simd_level(); - auto recommended_level = get_recommended_simd_level(); - - std::cout << "集成测试 - SIMD 级别: 最大=" << static_cast(max_level) - << ", 推荐=" << static_cast(recommended_level) << std::endl; - - // 2. 
注册多版本函数 - auto& dispatcher = simd_func_dispatcher::instance(); - const std::string func_name = "integration_vector_sum"; - - // 使用对齐分配器的向量进行计算 - using aligned_float_vector = std::vector>; - - // 注册标量版本 - dispatcher.register_function( - func_name, simd_func_version::SCALAR, - [](const aligned_float_vector& vec) -> float { - float sum = 0.0f; - for (const auto& val : vec) { - sum += val; - } - return sum; - }); - - // 根据支持的特性注册优化版本 - if (cpu_supports(cpu_feature::SSE)) { - dispatcher.register_function( - func_name, simd_func_version::SSE, - [](const aligned_float_vector& vec) -> float { - // 模拟SSE优化(实际实现会使用SSE指令) - float sum = 0.0f; - for (const auto& val : vec) { - sum += val; - } - return sum * 1.001f; // 添加小的标识以区分版本 - }); - } - - if (cpu_supports(cpu_feature::AVX)) { - dispatcher.register_function( - func_name, simd_func_version::AVX, - [](const aligned_float_vector& vec) -> float { - // 模拟AVX优化 - float sum = 0.0f; - for (const auto& val : vec) { - sum += val; - } - return sum * 1.002f; // AVX版本标识 - }); - } - - // 3. 创建测试数据(使用对齐分配) - aligned_float_vector test_data(10000); - std::iota(test_data.begin(), test_data.end(), 1.0f); - - // 验证数据对齐 - EXPECT_TRUE(simd_test_helpers::is_properly_aligned(test_data.data())); - - // 4. 执行计算 - const auto& func = dispatcher.get_function(func_name); - float result = func(test_data); - - // 5. 
验证结果 - float expected_base = 10000.0f * 10001.0f / 2.0f; // 等差数列求和 - EXPECT_GT(result, expected_base * 0.99f); // 允许一定的误差和版本差异 - EXPECT_LT(result, expected_base * 1.01f); - - std::cout << "集成测试结果: " << result << " (期望约 " << expected_base << ")" << std::endl; -} - -TEST_F(simd_test, SimdIntegrationTest_RealWorldScenarios) { - // 真实世界场景测试:图像处理、数值计算等 - - // 场景1:向量点积计算 - const size_t vector_size = 1024; - using aligned_vector = std::vector>; - - aligned_vector vec_a(vector_size), vec_b(vector_size); - - // 初始化向量 - for (size_t i = 0; i < vector_size; ++i) { - vec_a[i] = static_cast(i + 1); - vec_b[i] = static_cast((i + 1) * 2); - } - - // 注册点积函数 - auto& dispatcher = simd_func_dispatcher::instance(); - const std::string dot_product_name = "dot_product"; - - dispatcher.register_function( - dot_product_name, simd_func_version::SCALAR, - [](const aligned_vector& a, const aligned_vector& b) -> float { - float result = 0.0f; - for (size_t i = 0; i < a.size(); ++i) { - result += a[i] * b[i]; - } - return result; - }); - - // 执行点积计算 - float dot_result = CALL_SIMD_FUNCTION(float(const aligned_vector&, const aligned_vector&), - dot_product_name, vec_a, vec_b); - - // 验证结果(数学验证) - float expected = 0.0f; - for (size_t i = 0; i < vector_size; ++i) { - expected += vec_a[i] * vec_b[i]; - } - EXPECT_FLOAT_EQ(dot_result, expected); - - // 场景2:矩阵转置(简化版) - const size_t matrix_size = 64; // 64x64矩阵 - aligned_vector matrix(matrix_size * matrix_size); - aligned_vector transposed(matrix_size * matrix_size); - - // 初始化矩阵 - for (size_t i = 0; i < matrix_size; ++i) { - for (size_t j = 0; j < matrix_size; ++j) { - matrix[i * matrix_size + j] = static_cast(i * matrix_size + j); - } - } - - // 矩阵转置 - const std::string transpose_name = "matrix_transpose"; - dispatcher.register_function( - transpose_name, simd_func_version::SCALAR, - [](const aligned_vector& src, aligned_vector& dst, size_t size) { - for (size_t i = 0; i < size; ++i) { - for (size_t j = 0; j < size; ++j) { - dst[j * size + i] = src[i 
* size + j]; - } - } - }); - - CALL_SIMD_FUNCTION(void(const aligned_vector&, aligned_vector&, size_t), - transpose_name, matrix, transposed, matrix_size); - - // 验证转置结果 - for (size_t i = 0; i < matrix_size; ++i) { - for (size_t j = 0; j < matrix_size; ++j) { - EXPECT_FLOAT_EQ(transposed[j * matrix_size + i], matrix[i * matrix_size + j]); - } - } - - std::cout << "真实场景测试成功完成" << std::endl; -} - -// 性能基准测试 -TEST_F(simd_test, SimdPerformanceTest_AllocationSpeed) { - // 对齐分配性能基准测试 - - struct BenchmarkConfig { - size_t allocation_size; - size_t alignment; - size_t num_iterations; - std::string name; - }; - - std::vector configs = { - {1024, ALIGNMENT_SSE, 10000, "SSE-1KB"}, - {1024, ALIGNMENT_AVX, 10000, "AVX-1KB"}, - {1024, ALIGNMENT_AVX512, 10000, "AVX512-1KB"}, - {4096, ALIGNMENT_AVX, 5000, "AVX-4KB"}, - {16384, ALIGNMENT_AVX, 2000, "AVX-16KB"}, - {65536, ALIGNMENT_AVX, 1000, "AVX-64KB"} - }; - - std::cout << "\n分配速度基准测试:" << std::endl; - std::cout << "配置\t\t分配(毫秒)\t释放(毫秒)\t总计(毫秒)" << std::endl; - - for (const auto& config : configs) { - std::vector ptrs; - ptrs.reserve(config.num_iterations); - - // 分配基准 - simd_test_helpers::timer alloc_timer; - for (size_t i = 0; i < config.num_iterations; ++i) { - void* ptr = aligned_malloc(config.allocation_size, config.alignment); - ASSERT_NE(ptr, nullptr); - ptrs.push_back(ptr); - } - double alloc_time = alloc_timer.elapsed_ms(); - - // 释放基准 - simd_test_helpers::timer free_timer; - for (auto ptr : ptrs) { - aligned_free(ptr); - } - double free_time = free_timer.elapsed_ms(); - - double total_time = alloc_time + free_time; - - std::cout << config.name << "\t\t" - << std::fixed << std::setprecision(2) - << alloc_time << "\t\t" - << free_time << "\t\t" - << total_time << std::endl; - - // 基本性能断言 - EXPECT_GT(alloc_time, 0.0); - EXPECT_GT(free_time, 0.0); - EXPECT_LT(alloc_time / config.num_iterations, 1.0); // 平均每次分配应该小于1ms - } -} - -TEST_F(simd_test, SimdPerformanceTest_DispatchOverhead) { - // 函数分发开销基准测试 - - auto& dispatcher = 
simd_func_dispatcher::instance(); - const std::string bench_func_name = "dispatch_overhead_test"; - - // 注册一个简单的测试函数 - dispatcher.register_function( - bench_func_name, simd_func_version::SCALAR, - [](int x) { return x + 1; }); - - if (cpu_supports(cpu_feature::SSE)) { - dispatcher.register_function( - bench_func_name, simd_func_version::SSE, - [](int x) { return x + 2; }); - } - - const size_t num_calls = 1000000; // 100万次调用 - - // 基准1:直接函数调用 - auto direct_func = [](int x) { return x + 1; }; - - simd_test_helpers::timer direct_timer; - volatile int direct_result = 0; // volatile防止优化 - for (size_t i = 0; i < num_calls; ++i) { - direct_result += direct_func(static_cast(i)); - } - double direct_time = direct_timer.elapsed_ms(); - - // 基准2:通过分发器调用 - const auto& dispatched_func = dispatcher.get_function(bench_func_name); - - simd_test_helpers::timer dispatch_timer; - volatile int dispatch_result = 0; - for (size_t i = 0; i < num_calls; ++i) { - dispatch_result += dispatched_func(static_cast(i)); - } - double dispatch_time = dispatch_timer.elapsed_ms(); - - // 基准3:通过宏调用 - simd_test_helpers::timer macro_timer; - volatile int macro_result = 0; - for (size_t i = 0; i < num_calls; ++i) { - macro_result += CALL_SIMD_FUNCTION(int(int), bench_func_name, static_cast(i)); - } - double macro_time = macro_timer.elapsed_ms(); - - // 结果报告 - std::cout << "\n分发开销基准测试 (" << num_calls << " 次调用):" << std::endl; - std::cout << "直接函数调用: " << direct_time << " 毫秒" << std::endl; - std::cout << "分发函数调用: " << dispatch_time << " 毫秒" << std::endl; - std::cout << "宏调用: " << macro_time << " 毫秒" << std::endl; - - double dispatch_overhead = (dispatch_time - direct_time) / direct_time * 100.0; - double macro_overhead = (macro_time - direct_time) / direct_time * 100.0; - - std::cout << "分发开销: " << std::fixed << std::setprecision(2) - << dispatch_overhead << "%" << std::endl; - std::cout << "宏调用开销: " << macro_overhead << "%" << std::endl; - - // 性能断言 - EXPECT_GT(direct_time, 0.0); - 
EXPECT_GT(dispatch_time, 0.0); - EXPECT_GT(macro_time, 0.0); - - // 分发开销应该在合理范围内(调整为更现实的阈值) - EXPECT_LT(dispatch_overhead, 1000.0); // 允许10倍开销 - EXPECT_LT(macro_overhead, 10000.0); // 宏调用开销更大 - - // 验证结果正确性(防止编译器优化掉计算) - EXPECT_GT(direct_result, 0); - EXPECT_GT(dispatch_result, 0); - EXPECT_GT(macro_result, 0); -} - -// ============================================================================= -// 测试主入口点 -// ============================================================================= - -// 在测试开始前打印系统信息 -class SimdTestEnvironment : public ::testing::Environment { -public: - void SetUp() override { - std::cout << "\n" << std::string(60, '=') << std::endl; - std::cout << "SIMD 测试套件 - 系统信息" << std::endl; - std::cout << std::string(60, '=') << std::endl; - - cpu_feature_detector::instance().print_info(); - - std::cout << std::string(60, '=') << std::endl; - std::cout << "开始 SIMD 测试..." << std::endl; - std::cout << std::string(60, '=') << std::endl; - } - - void TearDown() override { - std::cout << std::string(60, '=') << std::endl; - std::cout << "SIMD 测试套件完成。" << std::endl; - std::cout << std::string(60, '=') << std::endl; - } -}; - -// 注册测试环境 -static ::testing::Environment* const simd_test_env = - ::testing::AddGlobalTestEnvironment(new SimdTestEnvironment);