feat: 添加 UTF-8 和 UTF-32 转换功能,重构相关代码以使用新的 str_utils.h 实现

This commit is contained in:
2025-12-13 13:18:30 +08:00
parent 702629bac3
commit abcbfd7d0c
6 changed files with 156 additions and 185 deletions

149
src/common/str_utils.h Normal file
View File

@@ -0,0 +1,149 @@
#pragma once
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <expected>
#include <ranges>
#include <span>
#include <string>
#include <string_view>
#include <utility>
namespace str_utils {
    /// @brief Describes one class of UTF-8 lead byte.
    ///
    /// A lead byte `b` belongs to the class when `(b & mask) == pattern`.
    struct utf8_pattern {
        uint8_t mask;      ///< Bits of the lead byte that identify the class.
        uint8_t pattern;   ///< Expected value of the lead byte after masking.
        uint8_t data_mask; ///< Bits of the lead byte that carry code point data.
        size_t length;     ///< Total sequence length in bytes, lead byte included.
    };

    /// @brief Lead-byte classes for 1- to 4-byte sequences (compile-time constant).
    inline constexpr std::array<utf8_pattern, 4> utf8_patterns{
        {
            {0x80, 0x00, 0x7F, 1}, // 1 byte:  0xxxxxxx
            {0xE0, 0xC0, 0x1F, 2}, // 2 bytes: 110xxxxx
            {0xF0, 0xE0, 0x0F, 3}, // 3 bytes: 1110xxxx
            {0xF8, 0xF0, 0x07, 4}  // 4 bytes: 11110xxx
        }
    };

    /// @brief Smallest code point that needs a sequence of N bytes, indexed by N - 1.
    ///
    /// A decoded value below this bound is an overlong encoding and is rejected.
    inline constexpr std::array<char32_t, 4> utf8_min_codepoints{0x0u, 0x80u, 0x800u, 0x10000u};

    /// @brief Tells whether a value is a Unicode scalar value.
    /// @param codepoint Value to check.
    /// @return true when `codepoint` is at most U+10FFFF and not in the
    ///         surrogate range U+D800..U+DFFF.
    [[nodiscard]] constexpr bool is_unicode_scalar(char32_t codepoint) noexcept {
        const bool is_surrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
        return codepoint <= 0x10FFFF && !is_surrogate;
    }

    /// @brief Error produced when decoding fails.
    struct decode_error {
        size_t bytes_to_skip; ///< Bytes the caller should skip: 0 only for empty input, 1 otherwise.
    };

    /// @brief Result of a successful decode.
    struct decode_result {
        char32_t codepoint;    ///< The decoded code point.
        size_t bytes_consumed; ///< Number of input bytes the code point occupied.
    };

    /// @brief Decodes the single UTF-8 encoded code point at the start of `bytes`.
    /// @param bytes Input byte sequence; only its leading sequence is examined.
    /// @return On success the code point and the number of bytes consumed.
    ///         On failure a decode_error: `bytes_to_skip` is 0 when `bytes` is
    ///         empty and 1 for every malformed input (unknown lead byte,
    ///         truncated sequence, bad continuation byte, overlong encoding,
    ///         surrogate, or value above U+10FFFF).
    [[nodiscard]] constexpr auto decode_utf8_char(std::span<const uint8_t> bytes)
        -> std::expected<decode_result, decode_error> {
        if (bytes.empty()) [[unlikely]] {
            return std::unexpected{decode_error{0}};
        }

        const uint8_t first = bytes[0];

        // Find the lead-byte class that matches the first byte.
        const auto match = std::ranges::find_if(utf8_patterns, [first](const utf8_pattern& p) {
            return (first & p.mask) == p.pattern;
        });
        if (match == utf8_patterns.end()) [[unlikely]] {
            return std::unexpected{decode_error{1}};
        }

        const size_t length = match->length;
        if (bytes.size() < length) [[unlikely]] {
            // Truncated sequence: skip the lead byte so the caller always advances.
            return std::unexpected{decode_error{1}};
        }

        char32_t codepoint = first & match->data_mask;

        // Fold in the continuation bytes (10xxxxxx), six data bits each.
        for (const uint8_t byte : bytes.subspan(1, length - 1)) {
            if ((byte & 0xC0) != 0x80) [[unlikely]] {
                return std::unexpected{decode_error{1}};
            }
            codepoint = (codepoint << 6) | (byte & 0x3F);
        }

        // Reject overlong encodings, surrogates and values beyond U+10FFFF.
        if (codepoint < utf8_min_codepoints[length - 1] || !is_unicode_scalar(codepoint)) [[unlikely]] {
            return std::unexpected{decode_error{1}};
        }

        return decode_result{codepoint, length};
    }

    /// @brief Appends the UTF-8 encoding of one UTF-32 code point to `output`.
    /// @param codepoint Code point to encode.
    /// @param output String that receives the encoded bytes.
    ///
    /// Values that are not Unicode scalar values (surrogates, or anything above
    /// U+10FFFF) are silently skipped and leave `output` untouched.
    constexpr void encode_utf8_char(char32_t codepoint, std::string& output) {
        if (!is_unicode_scalar(codepoint)) {
            return;
        }

        if (codepoint <= 0x7F) {
            output.push_back(static_cast<char>(codepoint));
        }
        else if (codepoint <= 0x7FF) {
            output.push_back(static_cast<char>(0xC0 | ((codepoint >> 6) & 0x1F)));
            output.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
        }
        else if (codepoint <= 0xFFFF) {
            output.push_back(static_cast<char>(0xE0 | ((codepoint >> 12) & 0x0F)));
            output.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
            output.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
        }
        else {
            output.push_back(static_cast<char>(0xF0 | ((codepoint >> 18) & 0x07)));
            output.push_back(static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F)));
            output.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
            output.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
        }
    }
} // namespace str_utils
/// @brief Converts a UTF-8 string to UTF-32.
/// @param utf8 Input text, read as raw bytes.
/// @return The code points str_utils::decode_utf8_char decodes, in order. Input
///         it reports as an error contributes nothing to the result.
[[nodiscard]] inline auto utf8_to_utf32(std::string_view utf8) -> std::u32string {
    std::u32string decoded_text;
    decoded_text.reserve(utf8.size()); // worst case: one code point per byte

    // View the characters as unsigned bytes for the decoder.
    const auto* raw = reinterpret_cast<const uint8_t*>(utf8.data());
    const std::span<const uint8_t> input{raw, utf8.size()};

    size_t offset = 0;
    while (offset < input.size()) {
        const auto outcome = str_utils::decode_utf8_char(input.subspan(offset));
        if (!outcome.has_value()) {
            // Always move forward by at least one byte, even when the decoder
            // reports nothing to skip.
            const size_t reported = outcome.error().bytes_to_skip;
            offset += (reported == 0) ? 1 : reported;
            continue;
        }
        decoded_text.push_back(outcome->codepoint);
        offset += outcome->bytes_consumed;
    }

    return decoded_text;
}
/// @brief 将 UTF-32 字符串转换为 UTF-8
[[nodiscard]] inline auto utf32_to_utf8(std::u32string_view utf32) -> std::string {
using namespace str_utils;
if (utf32.empty()) {
return {};
}
std::string result;
result.reserve(utf32.size() * 2); // 预留空间通常UTF-8会更长
// 使用 ranges::for_each 遍历并编码每个码点
std::ranges::for_each(utf32, [&result](char32_t cp) {
encode_utf8_char(cp, result);
});
return result;
}

View File

@@ -6,6 +6,7 @@
#include <variant>
#include <optional>
#include <iostream>
#include "str_utils.h"
namespace mirage::render {
@@ -194,7 +195,7 @@ namespace {
}
else if constexpr (std::is_same_v<CmdType, text_command>) {
// 转换UTF-8文本为UTF-32
auto utf32_text = text::text_shaper::utf8_to_utf32(concrete_cmd.text);
auto utf32_text = utf8_to_utf32(concrete_cmd.text);
// 使用text_shaper生成文本顶点数据
auto shaped = text_shaper.shape_text(
@@ -224,7 +225,7 @@ namespace {
}
else if constexpr (std::is_same_v<CmdType, text_effect_command>) {
// 转换UTF-8文本为UTF-32
auto utf32_text = text::text_shaper::utf8_to_utf32(concrete_cmd.text);
auto utf32_text = utf8_to_utf32(concrete_cmd.text);
// 使用text_shaper生成文本顶点数据
auto shaped = text_shaper.shape_text(

View File

@@ -215,88 +215,4 @@ namespace mirage::render::text {
.height = max_y - min_y
};
}
auto text_shaper::utf8_to_utf32(std::string_view utf8) -> std::u32string {
std::u32string result;
result.reserve(utf8.size()); // 最坏情况预留
// UTF-8 解码辅助结构
struct utf8_decode_result {
char32_t codepoint;
size_t bytes_consumed;
bool valid;
};
// 使用 constexpr lambda 进行 UTF-8 解码
constexpr auto decode_utf8_char = [](std::span<const uint8_t> bytes) -> utf8_decode_result {
if (bytes.empty()) [[unlikely]] {
return {0, 0, false};
}
const uint8_t first = bytes[0];
// 使用位掩码模式匹配
struct sequence_pattern {
uint8_t mask;
uint8_t pattern;
uint8_t data_mask;
size_t length;
};
constexpr std::array<sequence_pattern, 4> patterns = {
{
{0x80, 0x00, 0x7F, 1}, // 1字节: 0xxxxxxx
{0xE0, 0xC0, 0x1F, 2}, // 2字节: 110xxxxx
{0xF0, 0xE0, 0x0F, 3}, // 3字节: 1110xxxx
{0xF8, 0xF0, 0x07, 4} // 4字节: 11110xxx
}
};
// 查找匹配的模式
for (const auto& [mask, pattern, data_mask, length] : patterns) {
if ((first & mask) == pattern) {
if (bytes.size() < length) [[unlikely]] {
return {0, 0, false};
}
char32_t codepoint = first & data_mask;
// 读取后续字节continuation bytes: 10xxxxxx
for (size_t j = 1; j < length; ++j) {
if ((bytes[j] & 0xC0) != 0x80) [[unlikely]] {
return {0, 0, false};
}
codepoint = (codepoint << 6) | (bytes[j] & 0x3F);
}
return {codepoint, length, true};
}
}
// 无效序列
return {0, 1, false};
};
// 使用 span 安全地处理字节序列
const std::span bytes{
reinterpret_cast<const uint8_t*>(utf8.data()),
utf8.size()
};
size_t pos = 0;
while (pos < bytes.size()) {
const auto [codepoint, consumed, valid] = decode_utf8_char(bytes.subspan(pos));
if (valid && consumed > 0) [[likely]] {
result.push_back(codepoint);
pos += consumed;
}
else {
// 跳过无效字节
pos += (consumed > 0 ? consumed : 1);
}
}
return result;
}
} // namespace mirage::render::text

View File

@@ -44,10 +44,6 @@ public:
font_manager::font_id_t font_id,
float font_size
) -> text_metrics;
// UTF-8转UTF-32
static auto utf8_to_utf32(std::string_view utf8) -> std::u32string;
private:
font_manager& font_mgr_;
glyph_cache& cache_;

View File

@@ -3,6 +3,7 @@
#include <ranges>
#include <codecvt>
#include <locale>
#include "str_utils.h"
namespace mirage {
@@ -12,100 +13,6 @@ namespace mirage {
namespace {
/// @brief Converts a UTF-8 string to UTF-32.
/// @param utf8 Input text.
/// @return The decoded code points in order. A lead byte that does not start a
///         well-formed sequence (unknown lead byte, truncated sequence, bad
///         continuation byte, overlong encoding, surrogate, or value above
///         U+10FFFF) is skipped and decoding resumes at the next byte.
[[nodiscard]] std::u32string utf8_to_utf32(std::string_view utf8) {
    std::u32string result;
    result.reserve(utf8.size()); // a code point takes at least one byte

    size_t i = 0;
    while (i < utf8.size()) {
        const auto lead = static_cast<unsigned char>(utf8[i]);

        // Classify the lead byte: sequence length, its data bits, and the
        // smallest code point that legitimately needs that length.
        size_t length = 0;
        char32_t codepoint = 0;
        char32_t min_codepoint = 0;
        if (lead <= 0x7F) {
            length = 1;
            codepoint = lead;
        } else if ((lead & 0xE0) == 0xC0) {
            length = 2;
            codepoint = lead & 0x1F;
            min_codepoint = 0x80;
        } else if ((lead & 0xF0) == 0xE0) {
            length = 3;
            codepoint = lead & 0x0F;
            min_codepoint = 0x800;
        } else if ((lead & 0xF8) == 0xF0) {
            length = 4;
            codepoint = lead & 0x07;
            min_codepoint = 0x10000;
        } else {
            // Stray continuation byte or invalid lead byte.
            ++i;
            continue;
        }

        // A truncated sequence is treated like any other malformed sequence, so
        // valid text after it is still decoded.
        bool valid = (utf8.size() - i) >= length;
        for (size_t k = 1; valid && k < length; ++k) {
            const auto next = static_cast<unsigned char>(utf8[i + k]);
            if ((next & 0xC0) != 0x80) {
                valid = false; // not a 10xxxxxx continuation byte
            } else {
                codepoint = (codepoint << 6) | (next & 0x3F);
            }
        }

        // Reject overlong encodings, surrogates and out-of-range values.
        const bool is_surrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
        valid = valid && codepoint >= min_codepoint && codepoint <= 0x10FFFF && !is_surrogate;

        if (!valid) {
            ++i; // resynchronise on the next byte
            continue;
        }

        result.push_back(codepoint);
        i += length;
    }

    return result;
}
/// @brief Converts a UTF-32 string to UTF-8.
/// @param utf32 Code points to encode.
/// @return The UTF-8 encoding of the input. Values that are not Unicode scalar
///         values (surrogates U+D800..U+DFFF, or anything above U+10FFFF) are
///         skipped.
[[nodiscard]] std::string utf32_to_utf8(std::u32string_view utf32) {
    std::string result;
    result.reserve(utf32.size() * 2); // heuristic: two bytes per code point

    for (const char32_t codepoint : utf32) {
        // Surrogates have no well-formed UTF-8 encoding; drop them together
        // with out-of-range values.
        const bool is_surrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
        if (is_surrogate || codepoint > 0x10FFFF) {
            continue;
        }

        if (codepoint <= 0x7F) {
            // 1 byte
            result.push_back(static_cast<char>(codepoint));
        } else if (codepoint <= 0x7FF) {
            // 2 bytes
            result.push_back(static_cast<char>(0xC0 | ((codepoint >> 6) & 0x1F)));
            result.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
        } else if (codepoint <= 0xFFFF) {
            // 3 bytes
            result.push_back(static_cast<char>(0xE0 | ((codepoint >> 12) & 0x0F)));
            result.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
            result.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
        } else {
            // 4 bytes
            result.push_back(static_cast<char>(0xF0 | ((codepoint >> 18) & 0x07)));
            result.push_back(static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F)));
            result.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
            result.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
        }
    }

    return result;
}
} // anonymous namespace

View File

@@ -5,6 +5,8 @@
#include <string>
#include <string_view>
#include "str_utils.h"
namespace mirage {
/// @brief 文本控件 - 显示文本内容
class text_widget : public leaf_widget_base {
@@ -30,7 +32,7 @@ namespace mirage {
const auto& text = text_.get();
if (!text.empty()) {
// 转换UTF-8到UTF-32
auto utf32_text = render::text::text_shaper::utf8_to_utf32(text);
auto utf32_text = utf8_to_utf32(text);
// 使用text_shaper进行精确度量
auto metrics = ctx->get_text_shaper().measure_text(utf32_text, font_id_.get(), font_size_.get());