paper-burner/js/utils/text-fitting.js

474 lines
14 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// utils/text-fitting.js
// Text-fitting algorithm, tuned for format-preserving PDF translation
/**
 * Text-fitting engine.
 *
 * Core features:
 * 1. Searches for the largest font scale at which translated text fits the original bbox
 * 2. Simulates line wrapping and line spacing
 * 3. Handles mixed CJK / Western text
 * 4. Shrinks the text, or expands the container, when space is insufficient
 *
 * Design principles:
 * - Progressive scale search (start at the initial scale and shrink step by step)
 * - Global consistency (the statistical mode unifies the scale across paragraphs)
 * - Space expansion (downward first, rightward second)
 */
class TextFittingEngine {
  /**
   * @param {Object} [options] - Optional overrides for the tuning constants below.
   */
  constructor(options = {}) {
    const opts = options || {};
    // Like `value || fallback`, except that an explicit 0 is kept instead of
    // being silently replaced by the default.
    const zeroOk = (value, fallback) => (value === 0 ? 0 : value || fallback);

    // Scale search parameters
    this.INITIAL_SCALE = opts.initialScale || 1.0;
    this.MIN_SCALE = opts.minScale || 0.1;
    this.SCALE_STEP_HIGH = opts.scaleStepHigh || 0.05; // step used while scale > 0.6
    this.SCALE_STEP_LOW = opts.scaleStepLow || 0.1; // step used once scale <= 0.6
    // Described upstream as the threshold that triggers space expansion;
    // no method of this class reads it.
    this.EXPAND_THRESHOLD = opts.expandThreshold || 0.7;

    // Line-height multipliers (applied to the font size)
    this.LINE_SKIP_CJK = opts.lineSkipCJK || 1.30;
    this.LINE_SKIP_WESTERN = opts.lineSkipWestern || 1.20;
    this.MIN_LINE_HEIGHT = opts.minLineHeight || 1.05; // not read by any method of this class

    // Spacing configuration (0 is a legitimate value for all of these)
    this.CJK_SPACE_WIDTH_RATIO = zeroOk(opts.cjkSpaceRatio, 0.5);
    this.MIXED_LANG_SPACE_RATIO = zeroOk(opts.mixedLangSpaceRatio, 0.5);
    this.FIRST_LINE_INDENT_SPACES = zeroOk(opts.firstLineIndent, 4);

    // Margins added to the obstacle/page limit when expanding a bbox
    this.BOTTOM_EXPAND_MARGIN = zeroOk(opts.bottomExpandMargin, 2);
    this.RIGHT_EXPAND_MARGIN = zeroOk(opts.rightExpandMargin, -5);

    // Lazily created canvas context used to measure text width
    this._measureCanvas = null;
    this._measureContext = null;
  }

  /**
   * Finds the largest scale factor at which `text` fits inside `bbox`.
   *
   * Candidate scales start at INITIAL_SCALE and decrease by SCALE_STEP_HIGH
   * while above 0.6 and by SCALE_STEP_LOW afterwards. Each candidate is snapped
   * to six decimals so that repeated subtraction does not accumulate
   * floating-point drift, and MIN_SCALE itself is always tried as the last
   * candidate before giving up.
   *
   * @param {string} text - Translated text
   * @param {number[]} bbox - Bounding box [x0, y0, x1, y1]; x1 > x0 and y1 > y0 are required
   * @param {number} originalFontSize - Font size that corresponds to scale 1.0
   * @param {string} fontFamily - Font family used for measuring
   * @param {boolean} isCJK - Whether the text should be laid out as CJK
   * @param {Object} options - Extra layout options, forwarded to _layoutText
   * @returns {Object} { scale, reason, fitsWithoutExpansion } plus
   *   { lineCount, actualHeight } when the text fits, or
   *   { requiresExpansion: true } when it does not fit even at MIN_SCALE
   */
  calculateOptimalScale(text, bbox, originalFontSize, fontFamily = 'Arial', isCJK = false, options = {}) {
    if (!text || !bbox || bbox.length < 4) {
      return { scale: 1.0, reason: 'invalid_input', fitsWithoutExpansion: true };
    }

    const [x0, y0, x1, y1] = bbox;
    const availableWidth = x1 - x0;
    const availableHeight = y1 - y0;
    if (availableWidth <= 0 || availableHeight <= 0) {
      return { scale: 1.0, reason: 'invalid_bbox', fitsWithoutExpansion: true };
    }

    const lineSkip = isCJK ? this.LINE_SKIP_CJK : this.LINE_SKIP_WESTERN;
    const minScale = this.MIN_SCALE;
    const snap = (value) => Math.round(value * 1e6) / 1e6;

    // Never start below the floor (this also covers a NaN initial scale).
    let currentScale = this.INITIAL_SCALE > minScale ? this.INITIAL_SCALE : minScale;

    for (;;) {
      const layout = this._layoutText(
        text,
        availableWidth,
        availableHeight,
        originalFontSize * currentScale,
        fontFamily,
        lineSkip,
        isCJK,
        options
      );

      if (layout.fitsCompletely) {
        return {
          scale: currentScale,
          reason: 'fits_perfectly',
          fitsWithoutExpansion: true,
          lineCount: layout.lineCount,
          actualHeight: layout.actualHeight
        };
      }

      // The floor has just been tested: nothing smaller is allowed.
      if (!(currentScale > minScale)) break;

      const step = currentScale > 0.6 ? this.SCALE_STEP_HIGH : this.SCALE_STEP_LOW;
      const nextScale = step > 0 ? snap(currentScale - step) : minScale;
      // Clamp to the floor, and fall back to it whenever the step fails to make
      // progress (zero/negative/NaN/too small), so the search always terminates.
      currentScale = nextScale > minScale && nextScale < currentScale ? nextScale : minScale;
    }

    return {
      scale: this.MIN_SCALE,
      reason: 'requires_expansion',
      fitsWithoutExpansion: false,
      requiresExpansion: true
    };
  }

  /**
   * Simulates line wrapping of `text` and reports whether the resulting lines
   * fit into `maxHeight`. Only the total height decides `fitsCompletely`; a
   * single token wider than `maxWidth` stays on its own line without failing.
   *
   * @private
   * @param {string} text
   * @param {number} maxWidth
   * @param {number} maxHeight
   * @param {number} fontSize
   * @param {string} fontFamily
   * @param {number} lineSkip - Line-height multiplier
   * @param {boolean} isCJK
   * @param {Object} options - `firstLineIndent` (truthy) reserves an indent on the first line
   * @returns {Object} { fitsCompletely, lineCount, actualHeight, lines }
   */
  _layoutText(text, maxWidth, maxHeight, fontSize, fontFamily, lineSkip, isCJK, options = {}) {
    const layoutOptions = options || {};
    // NOTE(review): this class matches ASCII ?!,;: plus the ideographic 。 and 、;
    // fullwidth forms are not matched. Confirm against wrapText().
    const cjkPunctuation = /^[。?!,、;:]$/;

    const lines = [];
    const tokens = this._tokenizeText(text, isCJK);
    const spaceWidth = this._measureTextWidth(' ', fontSize, fontFamily);
    const cjkSpaceWidth = fontSize * this.CJK_SPACE_WIDTH_RATIO;

    let currentLine = '';
    // The first-line indent only consumes width; it adds no characters to the line.
    let currentWidth = layoutOptions.firstLineIndent
      ? cjkSpaceWidth * this.FIRST_LINE_INDENT_SPACES
      : 0;

    for (const token of tokens) {
      // A newline token forces a break; empty lines are not recorded.
      if (token === '\n') {
        if (currentLine) {
          lines.push(currentLine);
          currentLine = '';
          currentWidth = 0;
        }
        continue;
      }

      const tokenWidth = this._measureTextWidth(token, fontSize, fontFamily);

      // CJK punctuation always stays on the current line, even if it overflows.
      if (isCJK && cjkPunctuation.test(token)) {
        currentLine += token;
        currentWidth += tokenWidth;
        continue;
      }

      // Extra spacing where CJK and non-CJK characters meet. Both sides are
      // compared by whole code point so astral characters are classified correctly.
      const lastChar = this._lastCodePointChar(currentLine);
      const needsMixedSpace = lastChar !== '' &&
        this._isCJKChar(lastChar) !== this._isCJKChar(token);
      const mixedSpaceWidth = needsMixedSpace ? spaceWidth * this.MIXED_LANG_SPACE_RATIO : 0;

      const totalWidth = currentWidth + mixedSpaceWidth + tokenWidth;
      if (totalWidth > maxWidth && currentLine.length > 0) {
        // Wrap: the token starts a new line.
        lines.push(currentLine);
        currentLine = token;
        currentWidth = tokenWidth;
      } else {
        currentLine += token;
        currentWidth += mixedSpaceWidth;
        currentWidth += tokenWidth;
      }
    }

    if (currentLine) {
      lines.push(currentLine);
    }

    const actualHeight = lines.length * (fontSize * lineSkip);
    return {
      fitsCompletely: actualHeight <= maxHeight,
      lineCount: lines.length,
      actualHeight: actualHeight,
      lines: lines
    };
  }

  /**
   * Returns the last full character (code point) of `str`, or '' when empty.
   * A trailing surrogate pair is returned whole instead of as a lone low surrogate.
   *
   * @private
   * @param {string} str
   * @returns {string}
   */
  _lastCodePointChar(str) {
    if (!str) return '';
    const tail = str.slice(-2);
    return tail.codePointAt(0) > 0xFFFF ? tail : str.slice(-1);
  }

  /**
   * Splits text into layout tokens.
   *
   * CJK mode: punctuation marks and newlines become individual tokens and every
   * other character becomes its own token (split by code point, so surrogate
   * pairs are kept intact). Western mode: alternating runs of non-whitespace
   * and whitespace.
   *
   * Important (from the original author): this tokenization is meant to stay
   * consistent with wrapText() in history_pdf_compare.js.
   *
   * @private
   * @param {string} text
   * @param {boolean} isCJK
   * @returns {Array<string>}
   */
  _tokenizeText(text, isCJK) {
    if (!text) return [];
    if (!isCJK) {
      return text.match(/\S+|\s+/g) || [];
    }

    const tokens = [];
    // The capture group keeps the delimiters in the split result.
    for (const segment of text.split(/([。?!,、;:\n])/)) {
      if (!segment) continue;
      if (segment === '\n' || /^[。?!,、;:]$/.test(segment)) {
        tokens.push(segment);
      } else {
        // String iteration yields whole code points; pushing one by one also
        // avoids spreading a very long segment into a single call.
        for (const ch of segment) {
          tokens.push(ch);
        }
      }
    }
    return tokens;
  }

  /**
   * Tells whether the first character (code point) of `char` is in one of the
   * CJK-related Unicode ranges listed below.
   *
   * @private
   * @param {string} char
   * @returns {boolean}
   */
  _isCJKChar(char) {
    if (!char || char.length === 0) return false;
    // codePointAt (not charCodeAt) so that the Extension B range is reachable.
    const code = char.codePointAt(0);
    return (
      (code >= 0x4E00 && code <= 0x9FFF) || // CJK Unified Ideographs
      (code >= 0x3400 && code <= 0x4DBF) || // CJK Extension A
      (code >= 0x20000 && code <= 0x2A6DF) || // CJK Extension B
      (code >= 0x3000 && code <= 0x303F) || // CJK Symbols and Punctuation
      (code >= 0xFF00 && code <= 0xFFEF) || // Fullwidth Forms
      (code >= 0xAC00 && code <= 0xD7AF) || // Hangul Syllables
      (code >= 0x3040 && code <= 0x309F) || // Hiragana
      (code >= 0x30A0 && code <= 0x30FF) // Katakana
    );
  }

  /**
   * Measures the rendered width of `text` with a 2D canvas context.
   * The canvas is created through `document` on first use, so a DOM is required.
   *
   * @private
   * @param {string} text
   * @param {number} fontSize - Font size in px
   * @param {string} fontFamily
   * @returns {number} Width reported by measureText
   */
  _measureTextWidth(text, fontSize, fontFamily) {
    if (!this._measureContext) {
      this._measureCanvas = document.createElement('canvas');
      this._measureContext = this._measureCanvas.getContext('2d');
    }
    this._measureContext.font = `${fontSize}px ${fontFamily}`;
    return this._measureContext.measureText(text).width;
  }

  /**
   * Computes a scale per item and a single global scale for consistency.
   *
   * Strategy:
   * 1. Compute the optimal scale of every text item
   * 2. Take the mode (most frequent value) as the global scale
   * 3. Lower every item whose scale exceeds the mode down to the mode
   *
   * @param {Array<Object>} items - Items with `type`, `text` and `bbox`
   *   (described upstream as the entries of content_list.json)
   * @param {string} fontFamily
   * @param {string} targetLang
   * @returns {Object} { globalScale, itemScales } - itemScales is index-aligned
   *   with `items` and holds null for items that were skipped
   */
  calculateGlobalScale(items, fontFamily = 'Arial', targetLang = 'zh-CN') {
    const isCJK = this._isTargetLangCJK(targetLang);
    const scales = [];
    const itemScales = [];

    for (const item of items || []) {
      if (!item || item.type !== 'text' || !item.text || !item.bbox) {
        itemScales.push(null);
        continue;
      }
      // The original font size is estimated as 80% of the bbox height (empirical value).
      const estimatedFontSize = (item.bbox[3] - item.bbox[1]) * 0.8;
      const result = this.calculateOptimalScale(
        item.text,
        item.bbox,
        estimatedFontSize,
        fontFamily,
        isCJK
      );
      scales.push(result.scale);
      itemScales.push(result);
    }

    const globalScale = this._calculateMode(scales);

    // Items below the mode keep their smaller scale; only larger ones are capped.
    for (const entry of itemScales) {
      if (entry && entry.scale > globalScale) {
        entry.scale = globalScale;
        entry.reason = 'global_consistency';
      }
    }

    return { globalScale, itemScales };
  }

  /**
   * Returns the most frequent value after rounding to two decimals. On a tie
   * the value that reaches the top frequency first wins. Returns 1.0 when there
   * is no usable value.
   *
   * @private
   * @param {Array<number>} values
   * @returns {number}
   */
  _calculateMode(values) {
    if (!values || values.length === 0) return 1.0;

    const frequency = new Map();
    let maxFreq = 0;
    let mode = 1.0;
    for (const value of values) {
      if (value == null) continue;
      const rounded = Math.round(value * 100) / 100;
      const count = (frequency.get(rounded) || 0) + 1;
      frequency.set(rounded, count);
      if (count > maxFreq) {
        maxFreq = count;
        mode = rounded;
      }
    }
    return mode;
  }

  /**
   * Tells whether the target language should be treated as CJK.
   *
   * NOTE(review): matching is by case-insensitive substring, so any value that
   * merely contains one of the codes (e.g. 'Ukrainian' contains 'KR') also
   * matches; confirm the set of values callers actually pass.
   *
   * @private
   * @param {string} targetLang
   * @returns {boolean}
   */
  _isTargetLangCJK(targetLang) {
    if (!targetLang) return false;
    const upper = targetLang.toUpperCase();
    return ['ZH', 'JA', 'JP', 'KO', 'KR'].some((code) => upper.includes(code));
  }

  /**
   * Expands a bbox when scaling alone cannot make the text fit: downward
   * first, rightward only if no downward room exists. An edge is moved only
   * when the move actually enlarges the box after the margin is applied.
   *
   * NOTE(review): "downward" lowers bbox[1]; the helper treats items whose
   * y1 is smaller than this box's y0 as lying below it, i.e. it presumes y
   * grows upward. Confirm against the caller's coordinate system.
   *
   * @param {Array} bbox - [x0, y0, x1, y1]
   * @param {Array<Object>} allItems - All items on the page (used as obstacles)
   * @param {number} pageWidth
   * @param {number} pageHeight
   * @returns {Array} A new array holding the (possibly unchanged) bbox
   */
  expandBbox(bbox, allItems, pageWidth, pageHeight) {
    const [, y0, x1] = bbox;
    const expandedBbox = [...bbox];

    // Strategy 1: extend the bottom edge down to the nearest obstacle/page limit.
    const newBottom = this._getMaxBottomSpace(bbox, allItems, pageHeight) + this.BOTTOM_EXPAND_MARGIN;
    if (newBottom < y0) {
      expandedBbox[1] = newBottom;
      console.log(`[TextFitting] 向下扩展: ${y0} -> ${expandedBbox[1]}`);
      return expandedBbox;
    }

    // Strategy 2: extend the right edge (RIGHT_EXPAND_MARGIN is negative by
    // default, so the margin is included in the comparison to avoid shrinking).
    const newRight = this._getMaxRightSpace(bbox, allItems, pageWidth) + this.RIGHT_EXPAND_MARGIN;
    if (newRight > x1) {
      expandedBbox[2] = newRight;
      console.log(`[TextFitting] 向右扩展: ${x1} -> ${expandedBbox[2]}`);
    }

    return expandedBbox;
  }

  /**
   * Returns the lowest y the box may extend to: the largest y1 among items
   * that overlap horizontally and end below y0, but never less than 10% of
   * the page height.
   *
   * @private
   * @param {Array} bbox - [x0, y0, x1, y1]
   * @param {Array<Object>} allItems
   * @param {number} pageHeight
   * @returns {number}
   */
  _getMaxBottomSpace(bbox, allItems, pageHeight) {
    const [x0, y0, x1] = bbox;
    let minY = pageHeight * 0.1;
    for (const item of allItems || []) {
      // The box itself is skipped by identity of its bbox array.
      if (!item || !item.bbox || item.bbox === bbox) continue;
      const [ix0, , ix1, iy1] = item.bbox;
      const hasHorizontalOverlap = !(ix1 <= x0 || ix0 >= x1);
      if (iy1 < y0 && hasHorizontalOverlap) {
        minY = Math.max(minY, iy1);
      }
    }
    return minY;
  }

  /**
   * Returns the largest x the box may extend to: the smallest x0 among items
   * that overlap vertically and start to the right of this box's left edge,
   * but never more than 90% of the page width.
   *
   * @private
   * @param {Array} bbox - [x0, y0, x1, y1]
   * @param {Array<Object>} allItems
   * @param {number} pageWidth
   * @returns {number}
   */
  _getMaxRightSpace(bbox, allItems, pageWidth) {
    const [x0, y0, , y1] = bbox;
    let maxX = pageWidth * 0.9;
    for (const item of allItems || []) {
      if (!item || !item.bbox || item.bbox === bbox) continue;
      const [ix0, iy0, , iy1] = item.bbox;
      const hasVerticalOverlap = !(iy1 <= y0 || iy0 >= y1);
      if (ix0 > x0 && hasVerticalOverlap) {
        maxX = Math.min(maxX, ix0);
      }
    }
    return maxX;
  }
}
// Publish the engine: as a browser global when `window` exists, and as the
// CommonJS export when a `module.exports` object is available.
(() => {
  const inBrowser = typeof window !== 'undefined';
  const inCommonJs = typeof module !== 'undefined' && Boolean(module.exports);

  if (inBrowser) {
    window.TextFittingEngine = TextFittingEngine;
  }
  if (inCommonJs) {
    module.exports = TextFittingEngine;
  }
})();