### What problem does this PR solve?

Add agent component for web crawler.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
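For reference, a minimal sketch of the crawl4ai call this component wraps (assumes `pip install crawl4ai`; `https://example.com` is a placeholder URL, and the `crawl` helper is illustrative, not part of the PR):

```python
import asyncio

from crawl4ai import AsyncWebCrawler


async def crawl(url: str) -> str:
    # Open a crawler session and fetch the page, bypassing crawl4ai's cache,
    # exactly as the component's get_web() does in the diff below.
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url=url, bypass_cache=True)
    # The component can also return result.cleaned_html or result.extracted_content.
    return result.markdown


print(asyncio.run(crawl("https://example.com")))
```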
@@ -28,6 +28,7 @@ from .wencai import WenCai, WenCaiParam
from .jin10 import Jin10, Jin10Param
from .tushare import TuShare, TuShareParam
from .akshare import AkShare, AkShareParam
from .crawler import Crawler, CrawlerParam


def component_class(class_name):
@@ -0,0 +1,71 @@
#
#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
from abc import ABC
import asyncio
from crawl4ai import AsyncWebCrawler
from agent.component.base import ComponentBase, ComponentParamBase


class CrawlerParam(ComponentParamBase):
    """
    Define the Crawler component parameters.
    """

    def __init__(self):
        super().__init__()
        # These attributes are read in Crawler._run/get_web and set from the
        # front-end form; without them the component raises AttributeError.
        self.proxy = None
        self.extract_type = "markdown"

    def check(self):
        self.check_valid_value(self.extract_type, "Type of content from the crawler", ['html', 'markdown', 'content'])


class Crawler(ComponentBase, ABC):
    component_name = "Crawler"

    def _run(self, history, **kwargs):
        ans = self.get_input()
        ans = " - ".join(ans["content"]) if "content" in ans else ""
        if not ans:
            return Crawler.be_output("")

        try:
            result = asyncio.run(self.get_web(ans))
            return Crawler.be_output(result)
        except Exception as e:
            return Crawler.be_output(f"An unexpected error occurred: {str(e)}")

    async def get_web(self, url):
        proxy = self._param.proxy if self._param.proxy else None
        async with AsyncWebCrawler(verbose=True, proxy=proxy) as crawler:
            result = await crawler.arun(
                url=url,
                bypass_cache=True
            )
            match self._param.extract_type:
                case 'html':
                    return result.cleaned_html
                case 'markdown':
                    return result.markdown
                case 'content':
                    return result.extracted_content
                case _:
                    return result.markdown
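A standalone harness can exercise the same logic outside the agent runtime when testing; the `fetch` helper below is illustrative, not part of this PR:

```python
import asyncio

from crawl4ai import AsyncWebCrawler


async def fetch(url: str, proxy: str | None = None, extract_type: str = "markdown") -> str:
    # Mirrors Crawler.get_web: optional proxy, cache bypass, extract_type dispatch.
    async with AsyncWebCrawler(verbose=True, proxy=proxy) as crawler:
        result = await crawler.arun(url=url, bypass_cache=True)
    match extract_type:
        case "html":
            return result.cleaned_html
        case "content":
            return result.extracted_content
        case _:  # "markdown" or any unrecognized value
            return result.markdown


# Example: route through a local proxy, as the form's placeholder suggests.
# print(asyncio.run(fetch("https://example.com", proxy="http://127.0.0.1:8888")))
```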
@@ -0,0 +1 @@
<?xml version="1.0" standalone="no"?><!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"><svg class="icon" width="200px" height="200.00px" viewBox="0 0 1024 1024" version="1.1" xmlns="http://www.w3.org/2000/svg"><path d="M 777.121 313.158 a 265.121 265.121 0 0 0 -530.243 0 Z m 165.7 265.121 H 843.402 V 425.836 l 84.176 -84.176 a 33.1402 33.1402 0 1 0 -47.0591 -47.0591 l -66.2803 66.2803 h -596.524 l -66.2803 -66.2803 a 33.1402 33.1402 0 1 0 -47.0591 47.0591 L 180.598 425.836 V 578.281 H 81.177 a 33.1402 33.1402 0 0 0 0 66.2803 H 180.598 v 33.1402 a 294.285 294.285 0 0 0 39.7682 145.817 l -96.1069 95.4441 a 33.1402 33.1402 0 0 0 47.0591 47.0591 l 88.8157 -88.1529 A 296.937 296.937 0 0 0 478.859 975.959 h 3.97682 V 401.974 h 66.2803 V 975.959 a 296.937 296.937 0 0 0 215.411 -98.0944 l 88.8157 88.8157 a 33.1402 33.1402 0 0 0 47.0591 -47.0591 l -93.4554 -93.4554 a 293.621 293.621 0 0 0 36.4542 -149.131 V 644.561 h 99.4209 a 33.1402 33.1402 0 0 0 0 -66.2803 Z" fill="#1B69FD" /></svg>
@@ -928,6 +928,16 @@ The above is the content you need to summarize.`,
    yahooFinance: 'YahooFinance',
    yahooFinanceDescription:
      'The component queries information about the company based on the provided ticker symbol.',
    crawler: 'Web Crawler',
    crawlerDescription:
      'This component can be used to crawl HTML source code from a specified URL.',
    proxy: 'Proxy',
    crawlerResultOptions: {
      html: 'HTML',
      markdown: 'Markdown',
      content: 'Content',
    },
    extractType: 'Extract Type',
    info: 'Info',
    history: 'History',
    financials: 'Financials',
@@ -877,6 +877,15 @@ export default {
    akShareDescription: '此組件可用於從東方財富網取得對應股票的新聞資訊。',
    yahooFinance: '雅虎財經',
    yahooFinanceDescription: '該組件根據提供的股票代碼查詢有關公司的資訊。',
    crawler: '網頁爬蟲',
    crawlerDescription: '該組件可用於從指定URL爬取HTML源碼。',
    proxy: '代理',
    crawlerResultOptions: {
      html: 'HTML',
      markdown: 'Markdown',
      content: '文本',
    },
    extractType: '提取類型',
    info: '訊息',
    history: '歷史',
    financials: '財務',
@@ -897,6 +897,15 @@ export default {
    akShareDescription: '该组件可用于从东方财富网站获取相应股票的新闻信息。',
    yahooFinance: '雅虎财经',
    yahooFinanceDescription: '该组件根据提供的股票代码查询有关公司的信息。',
    crawler: '网页爬虫',
    crawlerDescription: '该组件可用于从指定URL爬取HTML源码。',
    proxy: '代理',
    crawlerResultOptions: {
      html: 'HTML',
      markdown: 'Markdown',
      content: '文本',
    },
    extractType: '提取类型',
    info: '信息',
    history: '历史',
    financials: '财务',
@@ -4,6 +4,7 @@ import { ReactComponent as baiduFanyiIcon } from '@/assets/svg/baidu-fanyi.svg';
import { ReactComponent as BaiduIcon } from '@/assets/svg/baidu.svg';
import { ReactComponent as BingIcon } from '@/assets/svg/bing.svg';
import { ReactComponent as ConcentratorIcon } from '@/assets/svg/concentrator.svg';
import { ReactComponent as CrawlerIcon } from '@/assets/svg/crawler.svg';
import { ReactComponent as DeepLIcon } from '@/assets/svg/deepl.svg';
import { ReactComponent as DuckIcon } from '@/assets/svg/duck.svg';
import { ReactComponent as ExeSqlIcon } from '@/assets/svg/exesql.svg';

@@ -73,6 +74,7 @@ export enum Operator {
  Concentrator = 'Concentrator',
  TuShare = 'TuShare',
  Note = 'Note',
  Crawler = 'Crawler',
}

export const CommonOperatorList = Object.values(Operator).filter(

@@ -110,6 +112,7 @@ export const operatorIconMap = {
  [Operator.Concentrator]: ConcentratorIcon,
  [Operator.TuShare]: TuShareIcon,
  [Operator.Note]: NoteIcon,
  [Operator.Crawler]: CrawlerIcon,
};

export const operatorMap: Record<

@@ -233,6 +236,9 @@ export const operatorMap: Record<
  },
  [Operator.TuShare]: { backgroundColor: '#f8cfa0' },
  [Operator.Note]: { backgroundColor: '#f8cfa0' },
  [Operator.Crawler]: {
    backgroundColor: '#dee0e2',
  },
};

export const componentMenuList = [

@@ -323,6 +329,9 @@ export const componentMenuList = [
  {
    name: Operator.TuShare,
  },
  {
    name: Operator.Crawler,
  },
];

export const initialRetrievalValues = {

@@ -572,6 +581,7 @@ export const RestrictedUpstreamMap = {
  [Operator.Jin10]: [Operator.Begin],
  [Operator.Concentrator]: [Operator.Begin],
  [Operator.TuShare]: [Operator.Begin],
  [Operator.Crawler]: [Operator.Begin],
};

export const NodeMap = {

@@ -605,6 +615,7 @@ export const NodeMap = {
  [Operator.Jin10]: 'ragNode',
  [Operator.TuShare]: 'ragNode',
  [Operator.Note]: 'noteNode',
  [Operator.Crawler]: 'ragNode',
};

export const LanguageOptions = [

@@ -2791,3 +2802,4 @@ export const TuShareSrcOptions = [
  'fenghuang',
  'jinrongjie',
];

export const CrawlerResultOptions = ['markdown', 'html', 'content'];
@@ -12,6 +12,7 @@ import BaiduForm from '../form/baidu-form';
import BeginForm from '../form/begin-form';
import BingForm from '../form/bing-form';
import CategorizeForm from '../form/categorize-form';
import CrawlerForm from '../form/crawler-form';
import DeepLForm from '../form/deepl-form';
import DuckDuckGoForm from '../form/duckduckgo-form';
import ExeSQLForm from '../form/exesql-form';

@@ -70,6 +71,7 @@ const FormMap = {
  [Operator.YahooFinance]: YahooFinanceForm,
  [Operator.Jin10]: Jin10Form,
  [Operator.TuShare]: TuShareForm,
  [Operator.Crawler]: CrawlerForm,
};

const EmptyContent = () => <div>empty</div>;
@@ -0,0 +1,37 @@
import { useTranslate } from '@/hooks/common-hooks';
import { Form, Input, Select } from 'antd';
import { useMemo } from 'react';
import { CrawlerResultOptions } from '../../constant';
import { IOperatorForm } from '../../interface';

const CrawlerForm = ({ onValuesChange, form }: IOperatorForm) => {
  const { t } = useTranslate('flow');

  // Build the Select options from the shared constant so the form stays in
  // sync with the backend dispatch, with labels resolved through i18n.
  const crawlerResultOptions = useMemo(() => {
    return CrawlerResultOptions.map((x) => ({
      value: x,
      label: t(`crawlerResultOptions.${x}`),
    }));
  }, [t]);

  return (
    <Form
      name="basic"
      labelCol={{ span: 6 }}
      wrapperCol={{ span: 18 }}
      autoComplete="off"
      form={form}
      onValuesChange={onValuesChange}
    >
      <Form.Item label={t('proxy')} name={'proxy'}>
        <Input placeholder="e.g. http://127.0.0.1:8888" />
      </Form.Item>
      <Form.Item
        label={t('extractType')}
        name={'extract_type'}
        initialValue="markdown"
      >
        <Select options={crawlerResultOptions} />
      </Form.Item>
    </Form>
  );
};

export default CrawlerForm;