mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-08 05:39:13 +00:00
Handle pagination, API rate limits. Get all commits from Github repo
This commit is contained in:
@@ -1,5 +1,6 @@
|
|||||||
# Standard Packages
|
# Standard Packages
|
||||||
import logging
|
import logging
|
||||||
|
import time
|
||||||
from typing import Dict, List
|
from typing import Dict, List
|
||||||
|
|
||||||
# External Packages
|
# External Packages
|
||||||
@@ -75,6 +76,13 @@ class GithubToJsonl(TextToJsonl):
|
|||||||
response = requests.get(repo_content_url, headers=headers)
|
response = requests.get(repo_content_url, headers=headers)
|
||||||
contents = response.json()
|
contents = response.json()
|
||||||
|
|
||||||
|
# If the rate limit is reached, wait for the reset time
|
||||||
|
if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
|
||||||
|
wait_time = int(response.headers.get("X-RateLimit-Reset")) - int(time.time())
|
||||||
|
logger.info(f"Github Rate limit reached. Waiting for {wait_time} seconds")
|
||||||
|
time.sleep(wait_time)
|
||||||
|
return self.get_markdown_files()
|
||||||
|
|
||||||
markdown_files = []
|
markdown_files = []
|
||||||
for item in contents["tree"]:
|
for item in contents["tree"]:
|
||||||
# Find all markdown files in the repository
|
# Find all markdown files in the repository
|
||||||
@@ -90,13 +98,28 @@ class GithubToJsonl(TextToJsonl):
|
|||||||
def get_commits(self) -> List[Dict]:
    """Get the commit messages of the repository using the Github API.

    Walks the paginated `/commits` endpoint by following the `next` link
    advertised in each response until no further page is offered. When the
    API reports that the rate limit is exhausted, sleeps until the
    advertised reset time and retries the same page.

    Returns:
        A list with one dict per commit: "content" holds the commit message
        and "path" holds the commit's `html_url`. If a page fails for a
        reason other than rate limiting, the error is logged and the commits
        collected so far are returned.
    """
    headers = {"Authorization": f"{self.config.pat_token}"}
    commits_url = f"{self.repo_url}/commits"

    commits = []
    while commits_url is not None:
        # Get the next page of commits
        response = requests.get(commits_url, headers=headers)

        if response.status_code != 200:
            # If the rate limit is reached, wait for the reset time
            if response.headers.get("X-RateLimit-Remaining") == "0":
                reset_time = int(response.headers.get("X-RateLimit-Reset", 0))
                # Clamp the wait: the reset time may already have passed (or
                # the header may be missing), and time.sleep() rejects
                # negative values. Waiting at least a second also avoids
                # hammering the API in a tight retry loop.
                wait_time = max(reset_time - int(time.time()), 1)
                logger.info(f"Github Rate limit reached. Waiting for {wait_time} seconds")
                time.sleep(wait_time)
                continue

            # Any other failure returns an error payload rather than a list
            # of commits, so stop paginating instead of trying to parse it.
            logger.error(f"Unable to get commits from {commits_url}. Response status: {response.status_code}")
            break

        # Extract commit messages from the response
        commits += [
            {"content": commit["commit"]["message"], "path": commit["html_url"]}
            for commit in response.json()
        ]

        # Get the URL for the next page of commits, if any
        commits_url = response.links.get("next", {}).get("url")

    return commits
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user