28 lines
990 B
Python
28 lines
990 B
Python
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
|
# SPDX-License-Identifier: MIT
|
|
|
|
|
|
from .article import Article
|
|
from .jina_client import JinaClient
|
|
from .readability_extractor import ReadabilityExtractor
|
|
|
|
|
|
class Crawler:
    """Fetch a web page and turn it into an ``Article``."""

    def crawl(self, url: str) -> Article:
        """Crawl ``url`` and return the article extracted from its HTML.

        The page is requested from ``JinaClient`` with
        ``return_format="html"``; that HTML is passed to
        ``ReadabilityExtractor.extract_article`` and the resulting
        article's ``url`` attribute is set to the requested ``url``.

        Args:
            url: Address of the page to crawl.

        Returns:
            The extracted article, with ``url`` set to the input URL.
        """
        # Design rationale: LLMs understand content better when it is a
        # clean article, so the pipeline extracts the article from HTML,
        # converts it to markdown, and splits it into text and image
        # blocks that form one single, unified LLM message.
        #
        # Jina is not the best crawler in terms of readability, but it is
        # much easier to use and free.
        #
        # Jina's own markdown converter is deliberately bypassed: raw HTML
        # is requested and run through our own extractor to get better
        # readability results.
        page_html = JinaClient().crawl(url, return_format="html")
        extracted = ReadabilityExtractor().extract_article(page_html)
        extracted.url = url
        return extracted
|