Skip indexing Github repo on hitting Github API rate limit

Sleeping until the rate limit has passed is too expensive, as it keeps an
app worker occupied.

Ideally we should schedule a job to continue after the rate limit wait time
has passed. But this can only be added once we support job scheduling.
This commit is contained in:
Debanjum Singh Solanky
2024-04-08 23:22:45 +05:30
parent d5c9b5cb32
commit 079f409238

View File

@@ -1,6 +1,6 @@
import logging import logging
import time import time
from typing import Any, List, Tuple from typing import Any, Dict, List, Tuple
import requests import requests
@@ -63,7 +63,9 @@ class GithubToEntries(TextToEntries):
logger.info(f"Processing github repo {repo_shorthand}") logger.info(f"Processing github repo {repo_shorthand}")
with timer("Download markdown files from github repo", logger): with timer("Download markdown files from github repo", logger):
try: try:
markdown_files, org_files = self.get_files(repo_url, repo) markdown_files, org_files, plaintext_files = self.get_files(repo_url, repo)
except ConnectionAbortedError as e:
logger.error(f"Github rate limit reached. Skip indexing github repo {repo_shorthand}")
except Exception as e: except Exception as e:
logger.error(f"Unable to download github repo {repo_shorthand}", exc_info=True) logger.error(f"Unable to download github repo {repo_shorthand}", exc_info=True)
raise e raise e
@@ -109,10 +111,9 @@ class GithubToEntries(TextToEntries):
response = requests.get(repo_content_url, headers=headers, params=params) response = requests.get(repo_content_url, headers=headers, params=params)
contents = response.json() contents = response.json()
# Wait for rate limit reset if needed # Raise exception if hit rate limit
result = self.wait_for_rate_limit_reset(response, self.get_files, repo_url, repo) if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
if result is not None: raise ConnectionAbortedError("Github rate limit reached")
return result
# Extract markdown files from the repository # Extract markdown files from the repository
markdown_files: List[Any] = [] markdown_files: List[Any] = []
@@ -144,10 +145,9 @@ class GithubToEntries(TextToEntries):
headers = {"Accept": "application/vnd.github.v3.raw"} headers = {"Accept": "application/vnd.github.v3.raw"}
response = self.session.get(file_url, headers=headers, stream=True) response = self.session.get(file_url, headers=headers, stream=True)
# Wait for rate limit reset if needed # Stop indexing on hitting rate limit
result = self.wait_for_rate_limit_reset(response, self.get_file_contents, file_url) if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
if result is not None: raise ConnectionAbortedError("Github rate limit reached")
return result
content = "" content = ""
for chunk in response.iter_content(chunk_size=2048): for chunk in response.iter_content(chunk_size=2048):