{"index_struct_id": "53347f08-c351-42a4-a6c3-8ebc46a95fee", "docstore": {"docs": {"53347f08-c351-42a4-a6c3-8ebc46a95fee": {"text": null, "doc_id": "53347f08-c351-42a4-a6c3-8ebc46a95fee", "embedding": null, "extra_info": null, "nodes_dict": {}, "id_map": {}, "embeddings_dict": {}, "__type__": "simple_dict"}}}, "vector_store": {"simple_vector_store_data_dict": {"embedding_dict": {}, "text_id_to_doc_id": {}}}}
src/Umbraco.Web.Common
to include it, but it looks like the reader somehow ignores it.
Checking src\Umbraco.Web.Common whether to FilterType.INCLUDE it based on the filter directories: ['src/Umbraco.Web.Common'] ignoring directory Umbraco.Web.Common due to filter
download_loader
, try download_loader(..., refresh_cache=True)
pip freeze > uninstall.txt && pip uninstall -y -r uninstall.txt && pip cache purge && pip install --upgrade httpx llama-index && python main.py
"""Reproduction script: load C# sources from umbraco/Umbraco-CMS via GithubRepositoryReader."""

from llama_index import download_loader

# Fetch the loader from the fork/branch under test, bypassing any cached copy.
# This call must stay ahead of the import below: the import targets the
# `llamahub_modules` package, which (presumably) only becomes importable once
# download_loader has pulled the loader down.
download_loader(
    "GithubRepositoryReader",
    refresh_cache=True,
    loader_hub_url="https://raw.githubusercontent.com/ahmetkca/llama-hub/github-reader-test-and-fix/loader_hub",
)

from llama_index.readers.llamahub_modules.github_repo import (  # noqa: E402
    GithubRepositoryReader,
    GithubClient,
)


def main():
    """Load the filtered repository files and print each document's extra_info.

    Only ``.cs`` files under ``src/Umbraco.Web.Common`` on the ``v10/main``
    branch of ``umbraco/Umbraco-CMS`` are requested (both filters use
    ``FilterType.INCLUDE``).
    """
    # NOTE(review): no token is passed explicitly; presumably the client reads
    # it from the environment -- confirm before running.
    github_client = GithubClient()

    github_repo_reader = GithubRepositoryReader(
        github_client,
        owner="umbraco",
        repo="Umbraco-CMS",
        use_parser=False,
        filter_directories=(
            ["src/Umbraco.Web.Common"],
            GithubRepositoryReader.FilterType.INCLUDE,
        ),
        filter_file_extensions=(
            [".cs"],
            GithubRepositoryReader.FilterType.INCLUDE,
        ),
        # Verbose output shows the per-path include/ignore filter decisions.
        verbose=True,
        concurrent_requests=2,
    )

    docs = github_repo_reader.load_data(branch="v10/main")

    for doc in docs:
        print(doc.extra_info)


if __name__ == "__main__":
    main()
concurrent_requests
it is 5 by default which means the GithubRepoReader
will retrieve 5 files concurrently. concurrent_requests
but it also means there is a high chance you will encounter a ConnectionTimeout
# because of the rate limiting by GitHub. I suggest 5 or below.

from llama_index import download_loader

# Re-download the loader, bypassing any cached copy. This call must stay ahead
# of the import below: the import targets the `llamahub_modules` package,
# which (presumably) only becomes importable once download_loader has run.
download_loader("GithubRepositoryReader", refresh_cache=True)

from llama_index.readers.llamahub_modules.github_repo import (  # noqa: E402
    GithubRepositoryReader,
    GithubClient,
)


def main():
    """Load the filtered repository files and print each document's extra_info.

    Only ``.cs`` files under ``src/Umbraco.Web.Common`` on the ``v10/main``
    branch of ``umbraco/Umbraco-CMS`` are requested (both filters use
    ``FilterType.INCLUDE``).
    """
    # NOTE(review): no token is passed explicitly; presumably the client reads
    # it from the environment -- confirm before running.
    github_client = GithubClient()

    github_repo_reader = GithubRepositoryReader(
        github_client,
        owner="umbraco",
        repo="Umbraco-CMS",
        use_parser=False,
        filter_directories=(
            ["src/Umbraco.Web.Common"],
            GithubRepositoryReader.FilterType.INCLUDE,
        ),
        filter_file_extensions=(
            [".cs"],
            GithubRepositoryReader.FilterType.INCLUDE,
        ),
        # Verbose output shows the per-path include/ignore filter decisions.
        verbose=True,
        # Kept low on purpose: more simultaneous requests make GitHub
        # rate limiting (and the resulting timeouts) more likely.
        concurrent_requests=2,
    )

    docs = github_repo_reader.load_data(branch="v10/main")

    for doc in docs:
        print(doc.extra_info)


if __name__ == "__main__":
    main()