mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-08 05:39:13 +00:00
Index commit messages from the repository with the GitHub plugin
This commit is contained in:
@@ -1,12 +1,13 @@
|
|||||||
# Standard Packages
|
# Standard Packages
|
||||||
import logging
|
import logging
|
||||||
|
from typing import Dict, List
|
||||||
|
|
||||||
# External Packages
|
# External Packages
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from khoj.utils.helpers import timer
|
from khoj.utils.helpers import timer
|
||||||
from khoj.utils.rawconfig import GithubContentConfig
|
from khoj.utils.rawconfig import Entry, GithubContentConfig
|
||||||
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
||||||
from khoj.processor.text_to_jsonl import TextToJsonl
|
from khoj.processor.text_to_jsonl import TextToJsonl
|
||||||
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
|
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||||
@@ -36,6 +37,9 @@ class GithubToJsonl(TextToJsonl):
|
|||||||
*GithubToJsonl.extract_markdown_entries(docs)
|
*GithubToJsonl.extract_markdown_entries(docs)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
with timer("Extract commit messages from github repo", logger):
|
||||||
|
current_entries += self.convert_commits_to_entries(self.get_commits())
|
||||||
|
|
||||||
with timer("Split entries by max token size supported by model", logger):
|
with timer("Split entries by max token size supported by model", logger):
|
||||||
current_entries = TextToJsonl.split_entries_by_max_tokens(current_entries, max_tokens=256)
|
current_entries = TextToJsonl.split_entries_by_max_tokens(current_entries, max_tokens=256)
|
||||||
|
|
||||||
@@ -83,6 +87,34 @@ class GithubToJsonl(TextToJsonl):
|
|||||||
|
|
||||||
return markdown_files
|
return markdown_files
|
||||||
|
|
||||||
|
def get_commits(self) -> List[Dict]:
    """Fetch commit messages of the configured repository from the Github API.

    Issues a single GET request to ``{self.repo_url}/commits`` and reduces
    each commit in the response to a dict with two keys:

    - ``content``: the commit message
    - ``path``: the commit's ``html_url``

    Only the one response returned by that request is read; pagination
    links are not followed.

    Returns:
        One dict per commit in the response. An empty list when the API
        answers with an error status or a payload that is not a JSON list.
    """
    # NOTE(review): the token is sent without a scheme prefix. Github's docs
    # show "Bearer <token>" / "token <token>" - confirm this authenticates.
    headers = {"Authorization": f"{self.config.pat_token}"}
    response = requests.get(f"{self.repo_url}/commits", headers=headers)

    # Error responses (bad token, rate limit, missing repo) carry a JSON
    # object instead of a commit list. Bail out with a clear log message
    # rather than failing on the indexing below.
    if not response.ok:
        logger.error(
            "Failed to get commits from %s/commits: %s %s",
            self.repo_url,
            response.status_code,
            response.text,
        )
        return []

    raw_commits = response.json()
    if not isinstance(raw_commits, list):
        logger.error("Unexpected commits response from %s/commits: %s", self.repo_url, raw_commits)
        return []

    # Extract commit messages from the response
    return [
        {"content": raw_commit["commit"]["message"], "path": raw_commit["html_url"]}
        for raw_commit in raw_commits
    ]
|
||||||
|
|
||||||
|
def convert_commits_to_entries(self, commits) -> List[Entry]:
    """Wrap each commit dict in an ``Entry``.

    Args:
        commits: iterable of dicts with a ``content`` key (the commit
            message) and a ``path`` key (stored as the entry's ``file``).

    Returns:
        One ``Entry`` per commit, in input order. The compiled text is the
        message prefixed with the repository owner/name, the raw text is
        the message behind a ``### `` prefix, and the heading is the
        first line of the message.
    """
    return [
        Entry(
            compiled=f'Commit message from {self.config.repo_owner}/{self.config.repo_name}:\n{commit["content"]}',
            raw=f'### {commit["content"]}',
            # Text before the first newline, i.e. the message's first line
            heading=commit["content"].partition("\n")[0],
            file=commit["path"],
        )
        for commit in commits
    ]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def extract_markdown_entries(markdown_files):
|
def extract_markdown_entries(markdown_files):
|
||||||
entries = []
|
entries = []
|
||||||
|
|||||||
Reference in New Issue
Block a user