mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-09 05:39:12 +00:00
Skip indexing Github repo on hitting Github API rate limit
Sleeping until the rate limit has passed is too expensive, as it keeps an app worker occupied. Ideally we should schedule a job to continue after the rate limit wait time has passed. But this can only be added once we support job scheduling.
This commit is contained in:
@@ -1,6 +1,6 @@
|
|||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
from typing import Any, List, Tuple
|
from typing import Any, Dict, List, Tuple
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
@@ -63,7 +63,9 @@ class GithubToEntries(TextToEntries):
|
|||||||
logger.info(f"Processing github repo {repo_shorthand}")
|
logger.info(f"Processing github repo {repo_shorthand}")
|
||||||
with timer("Download markdown files from github repo", logger):
|
with timer("Download markdown files from github repo", logger):
|
||||||
try:
|
try:
|
||||||
markdown_files, org_files = self.get_files(repo_url, repo)
|
markdown_files, org_files, plaintext_files = self.get_files(repo_url, repo)
|
||||||
|
except ConnectionAbortedError as e:
|
||||||
|
logger.error(f"Github rate limit reached. Skip indexing github repo {repo_shorthand}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Unable to download github repo {repo_shorthand}", exc_info=True)
|
logger.error(f"Unable to download github repo {repo_shorthand}", exc_info=True)
|
||||||
raise e
|
raise e
|
||||||
@@ -109,10 +111,9 @@ class GithubToEntries(TextToEntries):
|
|||||||
response = requests.get(repo_content_url, headers=headers, params=params)
|
response = requests.get(repo_content_url, headers=headers, params=params)
|
||||||
contents = response.json()
|
contents = response.json()
|
||||||
|
|
||||||
# Wait for rate limit reset if needed
|
# Raise exception if hit rate limit
|
||||||
result = self.wait_for_rate_limit_reset(response, self.get_files, repo_url, repo)
|
if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
|
||||||
if result is not None:
|
raise ConnectionAbortedError("Github rate limit reached")
|
||||||
return result
|
|
||||||
|
|
||||||
# Extract markdown files from the repository
|
# Extract markdown files from the repository
|
||||||
markdown_files: List[Any] = []
|
markdown_files: List[Any] = []
|
||||||
@@ -144,10 +145,9 @@ class GithubToEntries(TextToEntries):
|
|||||||
headers = {"Accept": "application/vnd.github.v3.raw"}
|
headers = {"Accept": "application/vnd.github.v3.raw"}
|
||||||
response = self.session.get(file_url, headers=headers, stream=True)
|
response = self.session.get(file_url, headers=headers, stream=True)
|
||||||
|
|
||||||
# Wait for rate limit reset if needed
|
# Stop indexing on hitting rate limit
|
||||||
result = self.wait_for_rate_limit_reset(response, self.get_file_contents, file_url)
|
if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
|
||||||
if result is not None:
|
raise ConnectionAbortedError("Github rate limit reached")
|
||||||
return result
|
|
||||||
|
|
||||||
content = ""
|
content = ""
|
||||||
for chunk in response.iter_content(chunk_size=2048):
|
for chunk in response.iter_content(chunk_size=2048):
|
||||||
|
|||||||
Reference in New Issue
Block a user