diff --git a/src/common/str_utils.h b/src/common/str_utils.h
new file mode 100644
index 0000000..c477209
--- /dev/null
+++ b/src/common/str_utils.h
@@ -0,0 +1,160 @@
+#pragma once
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <expected>
+#include <span>
+#include <string>
+#include <string_view>
+
+namespace str_utils {
+    /// @brief One UTF-8 lead-byte form: how to recognise it and how long its sequence is.
+    struct utf8_pattern {
+        uint8_t mask;      ///< lead-byte bits that identify the form
+        uint8_t pattern;   ///< expected value of (lead & mask)
+        uint8_t data_mask; ///< payload bits carried by the lead byte
+        size_t length;     ///< total sequence length in bytes
+    };
+
+    /// @brief Lead-byte forms for 1- to 4-byte sequences (compile-time constant).
+    inline constexpr std::array<utf8_pattern, 4> utf8_patterns{
+        {
+            {0x80, 0x00, 0x7F, 1}, // 1 byte:  0xxxxxxx
+            {0xE0, 0xC0, 0x1F, 2}, // 2 bytes: 110xxxxx
+            {0xF0, 0xE0, 0x0F, 3}, // 3 bytes: 1110xxxx
+            {0xF8, 0xF0, 0x07, 4}  // 4 bytes: 11110xxx
+        }
+    };
+
+    /// @brief Smallest code point that needs a sequence of the given length (index = length).
+    /// A decoded value below this bound is an overlong encoding.
+    inline constexpr std::array<char32_t, 5> min_codepoint_for_length{0, 0x00, 0x80, 0x800, 0x10000};
+
+    /// @brief True for Unicode scalar values: at most U+10FFFF and not a UTF-16 surrogate.
+    [[nodiscard]] constexpr bool is_scalar_value(char32_t codepoint) noexcept {
+        return codepoint <= 0x10FFFF && (codepoint < 0xD800 || codepoint > 0xDFFF);
+    }
+
+    /// @brief Decode failure.
+    struct decode_error {
+        size_t bytes_to_skip; ///< 0 for empty input, otherwise 1 (resynchronise at the next byte)
+    };
+
+    /// @brief Decode success.
+    struct decode_result {
+        char32_t codepoint;
+        size_t bytes_consumed;
+    };
+
+    /// @brief Decodes the UTF-8 sequence at the start of @p bytes.
+    /// @param bytes input bytes; only the leading sequence is examined
+    /// @return the code point and its byte length, or a decode_error when the input is empty,
+    ///         truncated, has a bad lead or continuation byte, is overlong, or encodes a
+    ///         surrogate or a value above U+10FFFF
+    [[nodiscard]] constexpr auto decode_utf8_char(std::span<const uint8_t> bytes)
+        -> std::expected<decode_result, decode_error> {
+        if (bytes.empty()) [[unlikely]] {
+            return std::unexpected{decode_error{0}};
+        }
+
+        const uint8_t first = bytes[0];
+
+        // Classify the lead byte; stray continuation bytes and 0xF8..0xFF match no form.
+        const auto match = std::ranges::find_if(utf8_patterns, [first](const utf8_pattern& p) {
+            return (first & p.mask) == p.pattern;
+        });
+
+        if (match == utf8_patterns.end()) [[unlikely]] {
+            return std::unexpected{decode_error{1}};
+        }
+
+        const size_t length = match->length;
+
+        if (bytes.size() < length) [[unlikely]] {
+            return std::unexpected{decode_error{1}};
+        }
+
+        char32_t codepoint = first & match->data_mask;
+
+        // Fold in the continuation bytes (span::subspan); each must look like 10xxxxxx.
+        for (const uint8_t byte : bytes.subspan(1, length - 1)) {
+            if ((byte & 0xC0) != 0x80) [[unlikely]] {
+                return std::unexpected{decode_error{1}};
+            }
+            codepoint = (codepoint << 6) | (byte & 0x3F);
+        }
+
+        // The bit pattern is well formed, but the value itself may still be illegal.
+        if (codepoint < min_codepoint_for_length[length] || !is_scalar_value(codepoint)) [[unlikely]] {
+            return std::unexpected{decode_error{1}};
+        }
+
+        return decode_result{codepoint, length};
+    }
+
+    /// @brief Appends the UTF-8 encoding of @p codepoint to @p output.
+    /// @param codepoint UTF-32 code point
+    /// @param output string that receives the encoded bytes
+    /// @note Surrogates and values above U+10FFFF have no UTF-8 form and are skipped.
+    constexpr void encode_utf8_char(char32_t codepoint, std::string& output) {
+        if (!is_scalar_value(codepoint)) [[unlikely]] {
+            return;
+        }
+
+        if (codepoint <= 0x7F) {
+            output.push_back(static_cast<char>(codepoint));
+        }
+        else if (codepoint <= 0x7FF) {
+            output.push_back(static_cast<char>(0xC0 | ((codepoint >> 6) & 0x1F)));
+            output.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
+        }
+        else if (codepoint <= 0xFFFF) {
+            output.push_back(static_cast<char>(0xE0 | ((codepoint >> 12) & 0x0F)));
+            output.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
+            output.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
+        }
+        else {
+            output.push_back(static_cast<char>(0xF0 | ((codepoint >> 18) & 0x07)));
+            output.push_back(static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F)));
+            output.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
+            output.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
+        }
+    }
+} // namespace str_utils
+
+/// @brief Converts UTF-8 text to UTF-32; malformed input bytes are dropped.
+[[nodiscard]] inline auto utf8_to_utf32(std::string_view utf8) -> std::u32string {
+    std::u32string result;
+    result.reserve(utf8.size()); // upper bound: one code point per input byte
+
+    const std::span<const uint8_t> bytes{
+        reinterpret_cast<const uint8_t*>(utf8.data()),
+        utf8.size()
+    };
+
+    for (size_t pos = 0; pos < bytes.size();) {
+        if (const auto decoded = str_utils::decode_utf8_char(bytes.subspan(pos))) [[likely]] {
+            result.push_back(decoded->codepoint);
+            pos += decoded->bytes_consumed;
+        }
+        else {
+            // Always advance by at least one byte so the loop terminates.
+            pos += std::max<size_t>(decoded.error().bytes_to_skip, 1);
+        }
+    }
+
+    return result;
+}
+
+/// @brief Converts UTF-32 text to UTF-8; code points without a UTF-8 form are dropped.
+[[nodiscard]] inline auto utf32_to_utf8(std::u32string_view utf32) -> std::string {
+    std::string result;
+    result.reserve(utf32.size() * 2); // heuristic: ASCII needs 1 byte, other text 2-4
+
+    for (const char32_t codepoint : utf32) {
+        str_utils::encode_utf8_char(codepoint, result);
+    }
+
+    return result;
+}
diff --git a/src/render/pipeline/render_tree_builder.cpp b/src/render/pipeline/render_tree_builder.cpp index d9f7de2..b12a9da 100644 --- a/src/render/pipeline/render_tree_builder.cpp +++ b/src/render/pipeline/render_tree_builder.cpp @@ -6,6 +6,7 @@ #include #include #include +#include "str_utils.h" namespace mirage::render { @@ -194,7 +195,7 @@ namespace { } else if constexpr (std::is_same_v) { // 转换UTF-8文本为UTF-32 - auto utf32_text = text::text_shaper::utf8_to_utf32(concrete_cmd.text); + auto utf32_text = utf8_to_utf32(concrete_cmd.text); // 使用text_shaper生成文本顶点数据 auto shaped = text_shaper.shape_text( @@ -224,7 +225,7 @@ namespace { } else if constexpr (std::is_same_v) { // 转换UTF-8文本为UTF-32 - auto utf32_text = text::text_shaper::utf8_to_utf32(concrete_cmd.text); + auto utf32_text = utf8_to_utf32(concrete_cmd.text); // 使用text_shaper生成文本顶点数据 auto shaped = text_shaper.shape_text( diff --git a/src/render/text/text_shaper.cpp b/src/render/text/text_shaper.cpp index ad59dd3..9cd018d 100644 --- a/src/render/text/text_shaper.cpp +++ b/src/render/text/text_shaper.cpp @@ -215,88 +215,4 @@ namespace mirage::render::text { .height = max_y - min_y }; } - - auto text_shaper::utf8_to_utf32(std::string_view utf8) -> std::u32string { - std::u32string result; - result.reserve(utf8.size()); // 最坏情况预留 - - // UTF-8 解码辅助结构 - struct utf8_decode_result { - char32_t codepoint; - size_t bytes_consumed; - bool valid; - }; - - // 使用 constexpr lambda 进行 UTF-8 解码 - constexpr auto decode_utf8_char = [](std::span bytes) -> 
utf8_decode_result { - if (bytes.empty()) [[unlikely]] { - return {0, 0, false}; - } - - const uint8_t first = bytes[0]; - - // 使用位掩码模式匹配 - struct sequence_pattern { - uint8_t mask; - uint8_t pattern; - uint8_t data_mask; - size_t length; - }; - - constexpr std::array patterns = { - { - {0x80, 0x00, 0x7F, 1}, // 1字节: 0xxxxxxx - {0xE0, 0xC0, 0x1F, 2}, // 2字节: 110xxxxx - {0xF0, 0xE0, 0x0F, 3}, // 3字节: 1110xxxx - {0xF8, 0xF0, 0x07, 4} // 4字节: 11110xxx - } - }; - - // 查找匹配的模式 - for (const auto& [mask, pattern, data_mask, length] : patterns) { - if ((first & mask) == pattern) { - if (bytes.size() < length) [[unlikely]] { - return {0, 0, false}; - } - - char32_t codepoint = first & data_mask; - - // 读取后续字节(continuation bytes: 10xxxxxx) - for (size_t j = 1; j < length; ++j) { - if ((bytes[j] & 0xC0) != 0x80) [[unlikely]] { - return {0, 0, false}; - } - codepoint = (codepoint << 6) | (bytes[j] & 0x3F); - } - - return {codepoint, length, true}; - } - } - - // 无效序列 - return {0, 1, false}; - }; - - // 使用 span 安全地处理字节序列 - const std::span bytes{ - reinterpret_cast(utf8.data()), - utf8.size() - }; - - size_t pos = 0; - while (pos < bytes.size()) { - const auto [codepoint, consumed, valid] = decode_utf8_char(bytes.subspan(pos)); - - if (valid && consumed > 0) [[likely]] { - result.push_back(codepoint); - pos += consumed; - } - else { - // 跳过无效字节 - pos += (consumed > 0 ? 
consumed : 1); - } - } - - return result; - } } // namespace mirage::render::text diff --git a/src/render/text/text_shaper.h b/src/render/text/text_shaper.h index 363e4d2..be614bf 100644 --- a/src/render/text/text_shaper.h +++ b/src/render/text/text_shaper.h @@ -44,10 +44,6 @@ public: font_manager::font_id_t font_id, float font_size ) -> text_metrics; - - // UTF-8转UTF-32 - static auto utf8_to_utf32(std::string_view utf8) -> std::u32string; - private: font_manager& font_mgr_; glyph_cache& cache_; diff --git a/src/widget/widgets/text_input/text_model.cpp b/src/widget/widgets/text_input/text_model.cpp index 6dde339..5945206 100644 --- a/src/widget/widgets/text_input/text_model.cpp +++ b/src/widget/widgets/text_input/text_model.cpp @@ -3,6 +3,7 @@ #include #include #include +#include "str_utils.h" namespace mirage { @@ -12,100 +13,6 @@ namespace mirage { namespace { -/// @brief 将 UTF-8 字符串转换为 UTF-32 -[[nodiscard]] std::u32string utf8_to_utf32(std::string_view utf8) { - if (utf8.empty()) { - return {}; - } - - std::u32string result; - result.reserve(utf8.size()); // 预留空间(通常UTF-32会更短) - - size_t i = 0; - while (i < utf8.size()) { - char32_t codepoint = 0; - unsigned char byte = static_cast(utf8[i]); - - if (byte <= 0x7F) { - // 1字节序列 (ASCII) - codepoint = byte; - i += 1; - } else if ((byte & 0xE0) == 0xC0) { - // 2字节序列 - if (i + 1 < utf8.size()) { - codepoint = ((byte & 0x1F) << 6) | - (utf8[i + 1] & 0x3F); - i += 2; - } else { - break; // 不完整的序列 - } - } else if ((byte & 0xF0) == 0xE0) { - // 3字节序列 - if (i + 2 < utf8.size()) { - codepoint = ((byte & 0x0F) << 12) | - ((utf8[i + 1] & 0x3F) << 6) | - (utf8[i + 2] & 0x3F); - i += 3; - } else { - break; - } - } else if ((byte & 0xF8) == 0xF0) { - // 4字节序列 - if (i + 3 < utf8.size()) { - codepoint = ((byte & 0x07) << 18) | - ((utf8[i + 1] & 0x3F) << 12) | - ((utf8[i + 2] & 0x3F) << 6) | - (utf8[i + 3] & 0x3F); - i += 4; - } else { - break; - } - } else { - // 无效的UTF-8序列,跳过 - i += 1; - continue; - } - - 
result.push_back(codepoint); - } - - return result; -} - -/// @brief 将 UTF-32 字符串转换为 UTF-8 -[[nodiscard]] std::string utf32_to_utf8(std::u32string_view utf32) { - if (utf32.empty()) { - return {}; - } - - std::string result; - result.reserve(utf32.size() * 2); // 预留空间(通常UTF-8会更长) - - for (char32_t codepoint : utf32) { - if (codepoint <= 0x7F) { - // 1字节 - result.push_back(static_cast(codepoint)); - } else if (codepoint <= 0x7FF) { - // 2字节 - result.push_back(static_cast(0xC0 | ((codepoint >> 6) & 0x1F))); - result.push_back(static_cast(0x80 | (codepoint & 0x3F))); - } else if (codepoint <= 0xFFFF) { - // 3字节 - result.push_back(static_cast(0xE0 | ((codepoint >> 12) & 0x0F))); - result.push_back(static_cast(0x80 | ((codepoint >> 6) & 0x3F))); - result.push_back(static_cast(0x80 | (codepoint & 0x3F))); - } else if (codepoint <= 0x10FFFF) { - // 4字节 - result.push_back(static_cast(0xF0 | ((codepoint >> 18) & 0x07))); - result.push_back(static_cast(0x80 | ((codepoint >> 12) & 0x3F))); - result.push_back(static_cast(0x80 | ((codepoint >> 6) & 0x3F))); - result.push_back(static_cast(0x80 | (codepoint & 0x3F))); - } - // 跳过无效的码点 - } - - return result; -} } // anonymous namespace diff --git a/src/widget/widgets/text_widget.h b/src/widget/widgets/text_widget.h index eebf302..0294486 100644 --- a/src/widget/widgets/text_widget.h +++ b/src/widget/widgets/text_widget.h @@ -5,6 +5,8 @@ #include #include +#include "str_utils.h" + namespace mirage { /// @brief 文本控件 - 显示文本内容 class text_widget : public leaf_widget_base { @@ -30,7 +32,7 @@ namespace mirage { const auto& text = text_.get(); if (!text.empty()) { // 转换UTF-8到UTF-32 - auto utf32_text = render::text::text_shaper::utf8_to_utf32(text); + auto utf32_text = utf8_to_utf32(text); // 使用text_shaper进行精确度量 auto metrics = ctx->get_text_shaper().measure_text(utf32_text, font_id_.get(), font_size_.get());