Skip indexing GitHub repo on hitting GitHub API rate limit

Sleeping until the rate limit has passed is too expensive, as it keeps
an app worker occupied.

Ideally, we should schedule a job to continue after the rate limit wait time
has passed. But this can only be added once we support job scheduling.
This commit is contained in:
Debanjum Singh Solanky
2024-04-08 23:22:45 +05:30
parent d5c9b5cb32
commit 079f409238

View File

@@ -1,6 +1,6 @@
import logging
import time
from typing import Any, List, Tuple
from typing import Any, Dict, List, Tuple
import requests
@@ -63,7 +63,9 @@ class GithubToEntries(TextToEntries):
logger.info(f"Processing github repo {repo_shorthand}")
with timer("Download markdown files from github repo", logger):
try:
markdown_files, org_files = self.get_files(repo_url, repo)
markdown_files, org_files, plaintext_files = self.get_files(repo_url, repo)
except ConnectionAbortedError as e:
logger.error(f"Github rate limit reached. Skip indexing github repo {repo_shorthand}")
except Exception as e:
logger.error(f"Unable to download github repo {repo_shorthand}", exc_info=True)
raise e
@@ -109,10 +111,9 @@ class GithubToEntries(TextToEntries):
response = requests.get(repo_content_url, headers=headers, params=params)
contents = response.json()
# Wait for rate limit reset if needed
result = self.wait_for_rate_limit_reset(response, self.get_files, repo_url, repo)
if result is not None:
return result
# Raise exception if hit rate limit
if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
raise ConnectionAbortedError("Github rate limit reached")
# Extract markdown files from the repository
markdown_files: List[Any] = []
@@ -144,10 +145,9 @@ class GithubToEntries(TextToEntries):
headers = {"Accept": "application/vnd.github.v3.raw"}
response = self.session.get(file_url, headers=headers, stream=True)
# Wait for rate limit reset if needed
result = self.wait_for_rate_limit_reset(response, self.get_file_contents, file_url)
if result is not None:
return result
# Stop indexing on hitting rate limit
if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
raise ConnectionAbortedError("Github rate limit reached")
content = ""
for chunk in response.iter_content(chunk_size=2048):