feat: 添加 UTF-8 和 UTF-32 转换功能，重构相关代码以使用新的 str_utils.h 实现

2025-12-13 13:18:30 +08:00
parent 702629bac3
commit abcbfd7d0c
6 changed files with 156 additions and 185 deletions
--- a/src/common/str_utils.h
+++ b/src/common/str_utils.h
@@ -0,0 +1,149 @@
+#pragma once
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <expected>
+#include <ranges>
+#include <span>
+#include <string>
+#include <string_view>
+#include <utility>
+
+namespace str_utils {
+	/// @brief Describes one class of UTF-8 lead byte.
+	///
+	/// A lead byte `b` belongs to the class when `(b & mask) == pattern`.
+	struct utf8_pattern {
+		uint8_t mask;      ///< Bits of the lead byte that identify the class.
+		uint8_t pattern;   ///< Expected value of the masked lead byte.
+		uint8_t data_mask; ///< Payload bits carried by the lead byte itself.
+		size_t  length;    ///< Total length of the sequence in bytes (1-4).
+	};
+
+	/// @brief Lead-byte classes for 1- to 4-byte sequences (compile-time table).
+	inline constexpr std::array<utf8_pattern, 4> utf8_patterns{
+		{
+			{0x80, 0x00, 0x7F, 1}, // 1 byte:  0xxxxxxx
+			{0xE0, 0xC0, 0x1F, 2}, // 2 bytes: 110xxxxx
+			{0xF0, 0xE0, 0x0F, 3}, // 3 bytes: 1110xxxx
+			{0xF8, 0xF0, 0x07, 4}  // 4 bytes: 11110xxx
+		}
+	};
+
+	/// @brief Error payload returned when decoding fails.
+	struct decode_error {
+		/// Number of input bytes the caller should skip. Zero means the decoder
+		/// has no specific amount to suggest (empty input, truncated sequence or
+		/// bad continuation byte); the caller picks its own step in that case.
+		size_t bytes_to_skip;
+	};
+
+	/// @brief Successful decoding result.
+	struct decode_result {
+		char32_t codepoint;      ///< Decoded Unicode scalar value.
+		size_t   bytes_consumed; ///< Number of input bytes that formed it.
+	};
+
+	/// @brief Decodes the single UTF-8 sequence at the start of @p bytes.
+	///
+	/// Besides the structural checks (lead byte class, sequence length,
+	/// continuation bytes) the decoded value is validated: overlong encodings,
+	/// surrogates (U+D800..U+DFFF) and values above U+10FFFF are rejected, so a
+	/// successful result always holds a Unicode scalar value.
+	///
+	/// @param bytes Input bytes; only the leading sequence is inspected.
+	/// @return The code point and the number of bytes consumed on success.
+	///         On failure a decode_error whose bytes_to_skip is:
+	///         - 0 for empty input, a truncated sequence or a bad continuation byte,
+	///         - 1 for a byte that cannot start a sequence,
+	///         - the sequence length for a structurally complete sequence that
+	///           encodes a forbidden value (overlong, surrogate, out of range).
+	[[nodiscard]] constexpr auto decode_utf8_char(std::span<const uint8_t> bytes)
+		-> std::expected<decode_result, decode_error> {
+		if (bytes.empty()) [[unlikely]] {
+			return std::unexpected{decode_error{0}};
+		}
+
+		const uint8_t first = bytes[0];
+
+		// Classify the lead byte against the pattern table.
+		const auto match = std::ranges::find_if(utf8_patterns, [first](const utf8_pattern& p) {
+			return (first & p.mask) == p.pattern;
+		});
+
+		if (match == utf8_patterns.end()) [[unlikely]] {
+			return std::unexpected{decode_error{1}};
+		}
+
+		const size_t length = match->length;
+
+		if (bytes.size() < length) [[unlikely]] {
+			return std::unexpected{decode_error{0}};
+		}
+
+		char32_t codepoint = first & match->data_mask;
+
+		// Fold in the continuation bytes (10xxxxxx), six payload bits each.
+		for (const uint8_t byte : bytes.subspan(1, length - 1)) {
+			if ((byte & 0xC0) != 0x80) [[unlikely]] {
+				return std::unexpected{decode_error{0}};
+			}
+			codepoint = (codepoint << 6) | (byte & 0x3F);
+		}
+
+		// Smallest value that legitimately needs a sequence of `length` bytes;
+		// anything below it is an overlong encoding.
+		constexpr std::array<char32_t, 4> min_codepoint{0x0, 0x80, 0x800, 0x10000};
+
+		const bool overlong     = codepoint < min_codepoint[length - 1];
+		const bool surrogate    = codepoint >= 0xD800 && codepoint <= 0xDFFF;
+		const bool out_of_range = codepoint > 0x10FFFF;
+		if (overlong || surrogate || out_of_range) [[unlikely]] {
+			// The sequence is structurally complete, so skip all of it at once.
+			return std::unexpected{decode_error{length}};
+		}
+
+		return decode_result{codepoint, length};
+	}
+
+	/// @brief Appends the UTF-8 encoding of one UTF-32 code point to @p output.
+	///
+	/// Values that are not Unicode scalar values (surrogates U+D800..U+DFFF and
+	/// anything above U+10FFFF) are silently skipped, because encoding them
+	/// would produce ill-formed UTF-8.
+	///
+	/// @param codepoint UTF-32 code point to encode.
+	/// @param output    String that receives the encoded bytes.
+	constexpr void encode_utf8_char(char32_t codepoint, std::string& output) {
+		const bool surrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
+		if (surrogate || codepoint > 0x10FFFF) [[unlikely]] {
+			return;
+		}
+
+		if (codepoint <= 0x7F) {
+			output.push_back(static_cast<char>(codepoint));
+		}
+		else if (codepoint <= 0x7FF) {
+			output.push_back(static_cast<char>(0xC0 | ((codepoint >> 6) & 0x1F)));
+			output.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
+		}
+		else if (codepoint <= 0xFFFF) {
+			output.push_back(static_cast<char>(0xE0 | ((codepoint >> 12) & 0x0F)));
+			output.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
+			output.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
+		}
+		else {
+			output.push_back(static_cast<char>(0xF0 | ((codepoint >> 18) & 0x07)));
+			output.push_back(static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F)));
+			output.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
+			output.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
+		}
+	}
+} // namespace str_utils
+
+/// @brief Converts a UTF-8 string to UTF-32.
+///
+/// The input is decoded one sequence at a time with
+/// str_utils::decode_utf8_char. Bytes that fail to decode are dropped: the
+/// cursor advances by the skip count reported by the decoder, or by a single
+/// byte when the decoder reports zero.
+///
+/// @param utf8 UTF-8 encoded input.
+/// @return The decoded code points, in input order.
+[[nodiscard]] inline auto utf8_to_utf32(std::string_view utf8) -> std::u32string {
+	std::u32string codepoints;
+	codepoints.reserve(utf8.size()); // upper bound: one code point per input byte
+
+	// View the characters as raw bytes so the decoder can work on uint8_t.
+	const auto* raw = reinterpret_cast<const uint8_t*>(utf8.data());
+	const std::span<const uint8_t> input{raw, utf8.size()};
+
+	size_t offset = 0;
+	while (offset < input.size()) {
+		const auto decoded = str_utils::decode_utf8_char(input.subspan(offset));
+
+		if (!decoded.has_value()) [[unlikely]] {
+			// Always move forward by at least one byte so the loop terminates.
+			const size_t reported = decoded.error().bytes_to_skip;
+			offset += (reported == 0) ? 1 : reported;
+			continue;
+		}
+
+		codepoints.push_back(decoded->codepoint);
+		offset += decoded->bytes_consumed;
+	}
+
+	return codepoints;
+}
+
+/// @brief Converts a UTF-32 string to UTF-8.
+///
+/// Every code point is handed to str_utils::encode_utf8_char, which appends
+/// its encoding to the result; whatever that encoder declines to emit is
+/// simply absent from the output.
+///
+/// @param utf32 UTF-32 input.
+/// @return The UTF-8 encoded string (empty for empty input).
+[[nodiscard]] inline auto utf32_to_utf8(std::u32string_view utf32) -> std::string {
+	std::string encoded;
+
+	if (!utf32.empty()) {
+		// Heuristic initial capacity; the string still grows on demand.
+		encoded.reserve(utf32.size() * 2);
+
+		for (const char32_t cp : utf32) {
+			str_utils::encode_utf8_char(cp, encoded);
+		}
+	}
+
+	return encoded;
+}
--- a/src/render/pipeline/render_tree_builder.cpp
+++ b/src/render/pipeline/render_tree_builder.cpp
@@ -6,6 +6,7 @@
 #include <variant>
 #include <optional>
 #include <iostream>
+#include "str_utils.h"

 namespace mirage::render {

@@ -194,7 +195,7 @@ namespace {
 			}
 			else if constexpr (std::is_same_v<CmdType, text_command>) {
 				// 转换UTF-8文本为UTF-32
-				auto utf32_text = text::text_shaper::utf8_to_utf32(concrete_cmd.text);
+				auto utf32_text = utf8_to_utf32(concrete_cmd.text);
 				
 				// 使用text_shaper生成文本顶点数据
 				auto shaped = text_shaper.shape_text(
@@ -224,7 +225,7 @@ namespace {
 			}
 			else if constexpr (std::is_same_v<CmdType, text_effect_command>) {
 				// 转换UTF-8文本为UTF-32
-				auto utf32_text = text::text_shaper::utf8_to_utf32(concrete_cmd.text);
+				auto utf32_text = utf8_to_utf32(concrete_cmd.text);
 				
 				// 使用text_shaper生成文本顶点数据
 				auto shaped = text_shaper.shape_text(
--- a/src/render/text/text_shaper.cpp
+++ b/src/render/text/text_shaper.cpp
@@ -215,88 +215,4 @@ namespace mirage::render::text {
 			.height = max_y - min_y
 		};
 	}
-
-	auto text_shaper::utf8_to_utf32(std::string_view utf8) -> std::u32string {
-		std::u32string result;
-		result.reserve(utf8.size()); // 最坏情况预留
-
-		// UTF-8 解码辅助结构
-		struct utf8_decode_result {
-			char32_t codepoint;
-			size_t   bytes_consumed;
-			bool     valid;
-		};
-
-		// 使用 constexpr lambda 进行 UTF-8 解码
-		constexpr auto decode_utf8_char = [](std::span<const uint8_t> bytes) -> utf8_decode_result {
-			if (bytes.empty()) [[unlikely]] {
-				return {0, 0, false};
-			}
-
-			const uint8_t first = bytes[0];
-
-			// 使用位掩码模式匹配
-			struct sequence_pattern {
-				uint8_t mask;
-				uint8_t pattern;
-				uint8_t data_mask;
-				size_t  length;
-			};
-
-			constexpr std::array<sequence_pattern, 4> patterns = {
-				{
-					{0x80, 0x00, 0x7F, 1}, // 1字节: 0xxxxxxx
-					{0xE0, 0xC0, 0x1F, 2}, // 2字节: 110xxxxx
-					{0xF0, 0xE0, 0x0F, 3}, // 3字节: 1110xxxx
-					{0xF8, 0xF0, 0x07, 4}  // 4字节: 11110xxx
-				}
-			};
-
-			// 查找匹配的模式
-			for (const auto& [mask, pattern, data_mask, length] : patterns) {
-				if ((first & mask) == pattern) {
-					if (bytes.size() < length) [[unlikely]] {
-						return {0, 0, false};
-					}
-
-					char32_t codepoint = first & data_mask;
-
-					// 读取后续字节（continuation bytes: 10xxxxxx）
-					for (size_t j = 1; j < length; ++j) {
-						if ((bytes[j] & 0xC0) != 0x80) [[unlikely]] {
-							return {0, 0, false};
-						}
-						codepoint = (codepoint << 6) | (bytes[j] & 0x3F);
-					}
-
-					return {codepoint, length, true};
-				}
-			}
-
-			// 无效序列
-			return {0, 1, false};
-		};
-
-		// 使用 span 安全地处理字节序列
-		const std::span bytes{
-			reinterpret_cast<const uint8_t*>(utf8.data()),
-			utf8.size()
-		};
-
-		size_t pos = 0;
-		while (pos < bytes.size()) {
-			const auto [codepoint, consumed, valid] = decode_utf8_char(bytes.subspan(pos));
-
-			if (valid && consumed > 0) [[likely]] {
-				result.push_back(codepoint);
-				pos += consumed;
-			}
-			else {
-				// 跳过无效字节
-				pos += (consumed > 0 ? consumed : 1);
-			}
-		}
-
-		return result;
-	}
 } // namespace mirage::render::text
--- a/src/render/text/text_shaper.h
+++ b/src/render/text/text_shaper.h
@@ -44,10 +44,6 @@ public:
        font_manager::font_id_t font_id,
        float font_size
    ) -> text_metrics;
-    
-    // UTF-8转UTF-32
-    static auto utf8_to_utf32(std::string_view utf8) -> std::u32string;
-
 private:
    font_manager& font_mgr_;
    glyph_cache& cache_;
--- a/src/widget/widgets/text_input/text_model.cpp
+++ b/src/widget/widgets/text_input/text_model.cpp
@@ -3,6 +3,7 @@
 #include <ranges>
 #include <codecvt>
 #include <locale>
+#include "str_utils.h"

 namespace mirage {

@@ -12,100 +13,6 @@ namespace mirage {

 namespace {

-/// @brief 将 UTF-8 字符串转换为 UTF-32
-[[nodiscard]] std::u32string utf8_to_utf32(std::string_view utf8) {
-	if (utf8.empty()) {
-		return {};
-	}
-	
-	std::u32string result;
-	result.reserve(utf8.size()); // 预留空间（通常UTF-32会更短）
-	
-	size_t i = 0;
-	while (i < utf8.size()) {
-		char32_t codepoint = 0;
-		unsigned char byte = static_cast<unsigned char>(utf8[i]);
-		
-		if (byte <= 0x7F) {
-			// 1字节序列 (ASCII)
-			codepoint = byte;
-			i += 1;
-		} else if ((byte & 0xE0) == 0xC0) {
-			// 2字节序列
-			if (i + 1 < utf8.size()) {
-				codepoint = ((byte & 0x1F) << 6) |
-				           (utf8[i + 1] & 0x3F);
-				i += 2;
-			} else {
-				break; // 不完整的序列
-			}
-		} else if ((byte & 0xF0) == 0xE0) {
-			// 3字节序列
-			if (i + 2 < utf8.size()) {
-				codepoint = ((byte & 0x0F) << 12) |
-				           ((utf8[i + 1] & 0x3F) << 6) |
-				           (utf8[i + 2] & 0x3F);
-				i += 3;
-			} else {
-				break;
-			}
-		} else if ((byte & 0xF8) == 0xF0) {
-			// 4字节序列
-			if (i + 3 < utf8.size()) {
-				codepoint = ((byte & 0x07) << 18) |
-				           ((utf8[i + 1] & 0x3F) << 12) |
-				           ((utf8[i + 2] & 0x3F) << 6) |
-				           (utf8[i + 3] & 0x3F);
-				i += 4;
-			} else {
-				break;
-			}
-		} else {
-			// 无效的UTF-8序列，跳过
-			i += 1;
-			continue;
-		}
-		
-		result.push_back(codepoint);
-	}
-	
-	return result;
-}
-
-/// @brief 将 UTF-32 字符串转换为 UTF-8
-[[nodiscard]] std::string utf32_to_utf8(std::u32string_view utf32) {
-	if (utf32.empty()) {
-		return {};
-	}
-	
-	std::string result;
-	result.reserve(utf32.size() * 2); // 预留空间（通常UTF-8会更长）
-	
-	for (char32_t codepoint : utf32) {
-		if (codepoint <= 0x7F) {
-			// 1字节
-			result.push_back(static_cast<char>(codepoint));
-		} else if (codepoint <= 0x7FF) {
-			// 2字节
-			result.push_back(static_cast<char>(0xC0 | ((codepoint >> 6) & 0x1F)));
-			result.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
-		} else if (codepoint <= 0xFFFF) {
-			// 3字节
-			result.push_back(static_cast<char>(0xE0 | ((codepoint >> 12) & 0x0F)));
-			result.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
-			result.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
-		} else if (codepoint <= 0x10FFFF) {
-			// 4字节
-			result.push_back(static_cast<char>(0xF0 | ((codepoint >> 18) & 0x07)));
-			result.push_back(static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F)));
-			result.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
-			result.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
-		}
-		// 跳过无效的码点
-	}
-	
-	return result;
-}

 } // anonymous namespace

--- a/src/widget/widgets/text_widget.h
+++ b/src/widget/widgets/text_widget.h
@@ -5,6 +5,8 @@
 #include <string>
 #include <string_view>

+#include "str_utils.h"
+
 namespace mirage {
 	/// @brief 文本控件 - 显示文本内容
 	class text_widget : public leaf_widget_base {
@@ -30,7 +32,7 @@ namespace mirage {
 				const auto& text = text_.get();
 				if (!text.empty()) {
 					// 转换UTF-8到UTF-32
-					auto utf32_text = render::text::text_shaper::utf8_to_utf32(text);
+					auto utf32_text = utf8_to_utf32(text);

 					// 使用text_shaper进行精确度量
 					auto metrics = ctx->get_text_shaper().measure_text(utf32_text, font_id_.get(), font_size_.get());