deerflow2/src/crawler/crawler.py

28 lines
990 B
Python

# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
from .article import Article
from .jina_client import JinaClient
from .readability_extractor import ReadabilityExtractor
class Crawler:
def crawl(self, url: str) -> Article:
# To help LLMs better understand content, we extract clean
# articles from HTML, convert them to markdown, and split
# them into text and image blocks for one single and unified
# LLM message.
#
# Jina is not the best crawler on readability, however it's
# much easier and free to use.
#
# Instead of using Jina's own markdown converter, we'll use
# our own solution to get better readability results.
jina_client = JinaClient()
html = jina_client.crawl(url, return_format="html")
extractor = ReadabilityExtractor()
article = extractor.extract_article(html)
article.url = url
return article