I test web plugins like this:
from llama_index import download_loader
def BeautifulSoupWebReader(url):
    """Load one web page with the LlamaHub ``BeautifulSoupWebReader``.

    The reader class is obtained through ``download_loader`` on every call,
    instantiated without arguments, and asked to load ``url``.

    Args:
        url: Address of the page to load; handed to the reader as a
            one-element ``urls`` list.

    Returns:
        Whatever the reader's ``load_data`` call returns for that URL.
    """
    # A distinct local name keeps the downloaded class from shadowing this
    # wrapper function's own name inside its body.
    reader_cls = download_loader("BeautifulSoupWebReader")
    return reader_cls().load_data(urls=[url])
def SimpleWebPageReader(url):
    """Load one web page with the LlamaHub ``SimpleWebPageReader``.

    The reader class is obtained through ``download_loader`` on every call,
    instantiated without arguments, and asked to load ``url``.

    Args:
        url: Address of the page to load; handed to the reader as a
            one-element ``urls`` list.

    Returns:
        Whatever the reader's ``load_data`` call returns for that URL.
    """
    # Use a separate local name so the downloaded class does not shadow this
    # wrapper function's own name.
    reader_cls = download_loader("SimpleWebPageReader")
    return reader_cls().load_data(urls=[url])
def UnstructuredURLLoader(url):
    """Load one web page with the LlamaHub ``UnstructuredURLLoader``.

    Unlike the other wrappers in this file, the URL list and options are
    given to the loader's constructor and the content is fetched with
    ``load()`` rather than ``load_data()``.

    Args:
        url: Address of the page to load; handed to the loader as a
            one-element ``urls`` list.

    Returns:
        Whatever the loader's ``load`` call returns.
    """
    loader_cls = download_loader("UnstructuredURLLoader")
    # Firefox-style User-Agent string passed to the loader as its headers.
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0"
    }
    # NOTE(review): continue_on_failure=False presumably makes fetch errors
    # surface instead of being skipped -- confirm against the loader docs.
    url_loader = loader_cls(
        urls=[url],
        continue_on_failure=False,
        headers=request_headers,
    )
    return url_loader.load()
def ReadabilityWebPageReader(url):
    """Load one web page with the LlamaHub ``ReadabilityWebPageReader``.

    The reader class is obtained through ``download_loader`` on every call
    and instantiated without arguments. This reader is called with a single
    ``url`` keyword rather than a ``urls`` list.

    Args:
        url: Address of the page to load.

    Returns:
        Whatever the reader's ``load_data`` call returns for that URL.
    """
    # Use a separate local name so the downloaded class does not shadow this
    # wrapper function's own name.
    reader_cls = download_loader("ReadabilityWebPageReader")
    return reader_cls().load_data(url=url)
# Registry of the LlamaHub web loaders under test, mapping the name that is
# printed in the report to the wrapper function that runs the loader.
llmahub_web_plugin = {
    # Plugin page: https://llamahub.ai/l/web-beautiful_soup_web
    "BeautifulSoupWebReader": BeautifulSoupWebReader,
    # Plugin page: https://llamahub.ai/l/web-simple_web
    "SimpleWebPageReader": SimpleWebPageReader,
    # Plugin page: https://llamahub.ai/l/web-unstructured_web
    "UnstructuredURLLoader": UnstructuredURLLoader,
    # Left out of the run; this reader uses Playwright.
    # Plugin page: https://llamahub.ai/l/web-readability_web
    # "ReadabilityWebPageReader": ReadabilityWebPageReader,
}
# Run every registered plugin against the same page and print the length of
# the first returned document's text so the loaders can be compared.
for key_web_plugin, load_page in llmahub_web_plugin.items():
    documents = load_page('https://research.ibm.com/blog/retrieval-augmented-generation-RAG')
    # If a loader returns no documents, report 0 instead of raising
    # IndexError, which would abort the comparison of the remaining plugins.
    text_length = len(documents[0].text) if documents else 0
    print(f"Llama Hub Web plugin:{key_web_plugin} \t Text length: {text_length}")