利用 Bun 内置的 HTMLRewriter(基于 Cloudflare 的 lol-html)将 HTML 过滤为纯文本

之前使用 happy-dom 经常出现 OOM 内存崩溃问题,改用了这个内置的组件。

当前版本(2025/03/18)代码实现如下,可以只取首行,限制最大字数。

/**
 * Strips an HTML fragment down to plain text using the `HTMLRewriter` global
 * (built into Bun, based on Cloudflare's lol-html).
 *
 * All working state lives in static fields that are reset at the start of
 * every `run()` call, and the result is read back right after `transform()`
 * returns, so the class is only meant for sequential, synchronous use.
 */
export class HTMLText {
    /** Tags that count as a line break: each adds a space and may end the "first line". */
    private static readonly BREAK_TAGS = new Set(['p', 'br', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']);
    /** 0 = collecting; 1 = first break tag seen, stop when it closes; 2 = stopped. */
    private static stop = 0;
    /** When true, collection stops after the first break-tag element has closed. */
    private static first = false;
    /** Text accumulated by the current run. */
    private static value = '';
    /** Shared rewriter, created on first use so that merely loading this module never touches `HTMLRewriter`. */
    private static rewriter: ReturnType<typeof HTMLText.createRewriter> | undefined;

    /** Builds the rewriter whose handlers append text into `value`. */
    private static createRewriter() {
        return new HTMLRewriter().on('*', {
            element: e => {
                if (this.stop === 2) { return; }
                if (this.BREAK_TAGS.has(e.tagName)) {
                    this.value += ' ';
                    // First-line mode: the first break tag arms the stop flag,
                    // and collection ends once that element's end tag is reached.
                    // NOTE(review): `<br>` is a void element with no end tag; how
                    // `onEndTag` behaves for it was not verified here.
                    if (this.first && !this.stop) {
                        this.stop = 1;
                        e.onEndTag(() => {
                            this.stop = 2;
                        });
                    }
                }
            },
            text: t => {
                if (this.stop === 2) { return; }
                if (t.text) {
                    // NOTE(review): each chunk is trimmed on its own, so whitespace
                    // between adjacent inline chunks is dropped. Kept as-is.
                    this.value += this.decodeEntities(t.text).trim();
                }
            }
        });
    }

    /** Decodes the small set of HTML entities handled by this class. */
    private static decodeEntities(raw: string): string {
        return raw
            .replace(/&lt;/g, '<')
            .replace(/&gt;/g, '>')
            .replace(/&quot;/g, '"')
            .replace(/&#39;/g, "'")
            .replace(/&nbsp;/g, ' ')
            // `&amp;` goes last so that "&amp;lt;" decodes to "&lt;" rather than "<".
            .replace(/&amp;/g, '&');
    }

    /**
     * Extracts text from `html`, in whichever mode (`first` flag) was set last.
     *
     * @param html HTML source; a falsy value yields the placeholder `'...'`.
     * @param len  Maximum result length; `0` or less means unlimited. Longer text
     *             is cut and suffixed with `'...'` so the total is `len`; limits
     *             below 3 leave no room for the suffix and are a plain cut.
     * @returns The collected text, trimmed and length-limited.
     */
    public static run(html: BodyInit | null, len: number): string {
        if (!html) { return '...' }
        this.stop = 0;
        this.value = '';
        if (!this.rewriter) {
            this.rewriter = this.createRewriter();
        }
        // The promise is deliberately not awaited: `value` is read immediately,
        // so this relies on the handlers having run by the time the call returns.
        // NOTE(review): presumably guaranteed by Bun for in-memory bodies - confirm.
        void this.rewriter.transform(new Response(html)).text();
        const text = this.value.trim();
        if (len <= 0 || text.length <= len) { return text; }
        // A negative slice end would count from the end of the string and
        // return more than `len` characters, so tiny limits are handled apart.
        if (len < 3) { return text.slice(0, len); }
        return text.slice(0, len - 3) + '...';
    }

    /** Returns only the first line (text up to the close of the first break-tag element). */
    public static one(html: BodyInit | null, len = 0) {
        this.first = true;
        return this.run(html, len)
    }

    /** Returns the text of the whole document. */
    public static all(html: BodyInit | null, len = 0) {
        this.first = false;
        return this.run(html, len)
    }
}
1