deerflow2/src/crawler/readability_extractor.py

30 lines
866 B
Python

# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
import logging
from readabilipy import simple_json_from_html_string
from .article import Article
logger = logging.getLogger(__name__)
class ReadabilityExtractor:
def extract_article(self, html: str) -> Article:
article = simple_json_from_html_string(html, use_readability=True)
content = article.get("content")
if not content or not str(content).strip():
logger.warning("Readability extraction returned empty content")
content = "<p>No content could be extracted from this page</p>"
title = article.get("title")
if not title or not str(title).strip():
title = "Untitled"
return Article(
title=title,
html_content=content,
)