PDF export: sanitize HTML (remove <style>, style attributes, <script>) to avoid xhtml2pdf font-size error

This commit is contained in:
2025-09-04 08:54:48 +03:00
parent 2f031591bf
commit 361f9d0bbe

11
app.py
View File

@@ -16,6 +16,7 @@ from flask import (
Response, Response,
) )
from io import BytesIO from io import BytesIO
import re
from xhtml2pdf import pisa # type: ignore from xhtml2pdf import pisa # type: ignore
from docx import Document # type: ignore from docx import Document # type: ignore
from htmldocx import HtmlToDocx # type: ignore from htmldocx import HtmlToDocx # type: ignore
@@ -268,6 +269,13 @@ def create_app():
abort(404) abort(404)
return row return row
def _sanitize_html_for_pdf(html: str) -> str:
    """Strip CSS and scripts from *html* before handing it to xhtml2pdf.

    xhtml2pdf copes poorly with modern CSS, so styling and scripts are
    removed up front instead of letting the PDF conversion fail.

    Removed, case-insensitively and across line breaks:

    * ``<style>...</style>`` blocks,
    * ``style`` attributes (double-quoted, single-quoted or unquoted,
      with optional whitespace around ``=``),
    * ``<script>...</script>`` blocks.

    Closing tags may carry whitespace before ``>`` (``</style >``).

    Args:
        html: HTML fragment to clean. ``None`` or an empty string is
            treated as an empty document.

    Returns:
        The fragment with the constructs above removed; ``""`` for
        empty input.
    """
    if not html:
        # Guard against a missing/NULL value; re.sub would raise TypeError.
        return ""

    flags = re.I | re.S

    # Require whitespace or ">" right after the tag name so look-alike
    # tags such as <styles> are left untouched.
    html = re.sub(r"<style(?:\s[^>]*)?>.*?</style\s*>", "", html, flags=flags)

    # The leading \s keeps attributes like data-style intact.  Quoted
    # values are tried first; the last alternative covers unquoted values.
    html = re.sub(
        r"""\sstyle\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>]+)""",
        "",
        html,
        flags=flags,
    )

    html = re.sub(r"<script(?:\s[^>]*)?>.*?</script\s*>", "", html, flags=flags)
    return html
def _wrap_html_for_export(title: str, html: str) -> str: def _wrap_html_for_export(title: str, html: str) -> str:
head_title = f"<title>{title}</title>" if title else "" head_title = f"<title>{title}</title>" if title else ""
return ( return (
@@ -280,7 +288,8 @@ def create_app():
def export_pdf(uid: str): def export_pdf(uid: str):
row = _fetch_page(uid) row = _fetch_page(uid)
title = row["title"] or f"page-{uid[:8]}" title = row["title"] or f"page-{uid[:8]}"
html_doc = _wrap_html_for_export(title, row["html"]) cleaned = _sanitize_html_for_pdf(row["html"])
html_doc = _wrap_html_for_export(title, cleaned)
out = BytesIO() out = BytesIO()
pisa.CreatePDF(src=html_doc, dest=out) pisa.CreatePDF(src=html_doc, dest=out)
out.seek(0) out.seek(0)