Index commits message from repository with the github plugin

This commit is contained in:
Debanjum Singh Solanky
2023-06-17 02:50:58 -07:00
parent c29c141a7e
commit 31d17d0b22

View File

@@ -1,12 +1,13 @@
# Standard Packages
import logging
from typing import Dict, List
# External Packages
import requests
# Internal Packages
from khoj.utils.helpers import timer
from khoj.utils.rawconfig import GithubContentConfig
from khoj.utils.rawconfig import Entry, GithubContentConfig
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
from khoj.processor.text_to_jsonl import TextToJsonl
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
@@ -36,6 +37,9 @@ class GithubToJsonl(TextToJsonl):
*GithubToJsonl.extract_markdown_entries(docs)
)
with timer("Extract commit messages from github repo", logger):
current_entries += self.convert_commits_to_entries(self.get_commits())
with timer("Split entries by max token size supported by model", logger):
current_entries = TextToJsonl.split_entries_by_max_tokens(current_entries, max_tokens=256)
@@ -83,6 +87,34 @@ class GithubToJsonl(TextToJsonl):
return markdown_files
def get_commits(self) -> List[Dict]:
# Get commit messages from the repository using the Github API
headers = {"Authorization": f"{self.config.pat_token}"}
response = requests.get(f"{self.repo_url}/commits", headers=headers)
raw_commits = response.json()
# Extract commit messages from the response
commits = []
for commit in raw_commits:
commits += [{"content": commit["commit"]["message"], "path": commit["html_url"]}]
return commits
def convert_commits_to_entries(self, commits) -> List[Entry]:
entries: List[Entry] = []
for commit in commits:
compiled = f'Commit message from {self.config.repo_owner}/{self.config.repo_name}:\n{commit["content"]}'
entries.append(
Entry(
compiled=compiled,
raw=f'### {commit["content"]}',
heading=commit["content"].split("\n")[0],
file=commit["path"],
)
)
return entries
@staticmethod
def extract_markdown_entries(markdown_files):
entries = []