mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-09 13:25:11 +00:00
Get all md files recursively in repository by passing recursive param
Previously, the `get_markdown_files` method was only getting files at the root of the repository. Also fix and improve the logger messages in the GitHub-to-JSONL processor.
This commit is contained in:
@@ -38,10 +38,10 @@ class GithubToJsonl(TextToJsonl):
|
|||||||
try:
|
try:
|
||||||
docs = self.get_markdown_files()
|
docs = self.get_markdown_files()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Unable to download github repo for {self.config.repo_owner}/{self.config.repo_name}")
|
logger.error(f"Unable to download github repo {self.config.repo_owner}/{self.config.repo_name}")
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
logger.info(f"Found {len(docs)} documents in {self.config.repo_owner}/{self.config.repo_name}")
|
logger.info(f"Found {len(docs)} documents in github repo {self.config.repo_owner}/{self.config.repo_name}")
|
||||||
|
|
||||||
with timer("Extract markdown entries from github repo", logger):
|
with timer("Extract markdown entries from github repo", logger):
|
||||||
current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps(
|
current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps(
|
||||||
@@ -63,7 +63,7 @@ class GithubToJsonl(TextToJsonl):
|
|||||||
current_entries, previous_entries, key="compiled", logger=logger
|
current_entries, previous_entries, key="compiled", logger=logger
|
||||||
)
|
)
|
||||||
|
|
||||||
with timer("Write markdown entries to JSONL file", logger):
|
with timer("Write github entries to JSONL file", logger):
|
||||||
# Process Each Entry from All Notes Files
|
# Process Each Entry from All Notes Files
|
||||||
entries = list(map(lambda entry: entry[1], entries_with_ids))
|
entries = list(map(lambda entry: entry[1], entries_with_ids))
|
||||||
jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries)
|
jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries)
|
||||||
@@ -80,7 +80,8 @@ class GithubToJsonl(TextToJsonl):
|
|||||||
# Get the contents of the repository
|
# Get the contents of the repository
|
||||||
repo_content_url = f"{self.repo_url}/git/trees/{self.config.repo_branch}"
|
repo_content_url = f"{self.repo_url}/git/trees/{self.config.repo_branch}"
|
||||||
headers = {"Authorization": f"token {self.config.pat_token}"}
|
headers = {"Authorization": f"token {self.config.pat_token}"}
|
||||||
response = requests.get(repo_content_url, headers=headers)
|
params = {"recursive": "true"}
|
||||||
|
response = requests.get(repo_content_url, headers=headers, params=params)
|
||||||
contents = response.json()
|
contents = response.json()
|
||||||
|
|
||||||
# Wait for rate limit reset if needed
|
# Wait for rate limit reset if needed
|
||||||
|
|||||||
Reference in New Issue
Block a user