mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-07 21:29:13 +00:00
Merge pull request #253 from khoj-ai/features/github-issues-indexing
Support indexing Github issues as well as corresponding comments
This commit is contained in:
@@ -11,7 +11,7 @@ content-type:
|
|||||||
|
|
||||||
markdown:
|
markdown:
|
||||||
input-files: null
|
input-files: null
|
||||||
input-filter: ["/data/markdown/**/*.md"]
|
input-filter: ["/data/markdown/**/*.markdown"]
|
||||||
compressed-jsonl: "/data/embeddings/markdown.jsonl.gz"
|
compressed-jsonl: "/data/embeddings/markdown.jsonl.gz"
|
||||||
embeddings-file: "/data/embeddings/markdown_embeddings.pt"
|
embeddings-file: "/data/embeddings/markdown_embeddings.pt"
|
||||||
|
|
||||||
|
|||||||
@@ -24,23 +24,26 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
function render_org(query, data, classPrefix="") {
|
function render_org(query, data, classPrefix="") {
|
||||||
var orgCode = data.map(function (item) {
|
return data.map(function (item) {
|
||||||
return `${item.entry}`
|
var orgParser = new Org.Parser();
|
||||||
}).join("\n")
|
var orgDocument = orgParser.parse(item.entry);
|
||||||
var orgParser = new Org.Parser();
|
var orgHTMLDocument = orgDocument.convert(Org.ConverterHTML, { htmlClassPrefix: classPrefix });
|
||||||
var orgDocument = orgParser.parse(orgCode);
|
return `<div class="results-org">` + orgHTMLDocument.toString() + `</div>`;
|
||||||
var orgHTMLDocument = orgDocument.convert(Org.ConverterHTML, { htmlClassPrefix: classPrefix });
|
}).join("\n");
|
||||||
return `<div class="results-org">` + orgHTMLDocument.toString() + `</div>`;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function render_markdown(query, data) {
|
function render_markdown(query, data) {
|
||||||
var md = window.markdownit();
|
var md = window.markdownit();
|
||||||
return data.map(function (item) {
|
return data.map(function (item) {
|
||||||
|
let rendered = "";
|
||||||
if (item.additional.file.startsWith("http")) {
|
if (item.additional.file.startsWith("http")) {
|
||||||
lines = item.entry.split("\n");
|
lines = item.entry.split("\n");
|
||||||
return md.render(`${lines[0]}\t[*](${item.additional.file})\n${lines.slice(1).join("\n")}`);
|
rendered = md.render(`${lines[0]}\t[*](${item.additional.file})\n${lines.slice(1).join("\n")}`);
|
||||||
}
|
}
|
||||||
return `<div class="results-markdown">` + md.render(`${item.entry}`) + `</div>`;
|
else {
|
||||||
|
rendered = md.render(`${item.entry}`);
|
||||||
|
}
|
||||||
|
return `<div class="results-markdown">` + rendered + `</div>`;
|
||||||
}).join("\n");
|
}).join("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -59,16 +62,21 @@
|
|||||||
}).join("\n");
|
}).join("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
function render_mutliple(query, data, type) {
|
function render_multiple(query, data, type) {
|
||||||
let html = "";
|
let html = "";
|
||||||
data.forEach(item => {
|
data.forEach(item => {
|
||||||
if (item.additional.file.endsWith(".org")) {
|
if (item.additional.file.endsWith(".org")) {
|
||||||
html += render_org(query, [item], "org-");
|
html += render_org(query, [item], "org-");
|
||||||
} else if (item.additional.file.endsWith(".md")) {
|
} else if (
|
||||||
html += render_markdown(query, [item]);
|
item.additional.file.endsWith(".md") ||
|
||||||
} else if (item.additional.file.endsWith(".pdf")) {
|
item.additional.file.endsWith(".markdown") ||
|
||||||
html += render_pdf(query, [item]);
|
(item.additional.file.includes("issues") && item.additional.file.includes("github.com"))
|
||||||
}
|
)
|
||||||
|
{
|
||||||
|
html += render_markdown(query, [item]);
|
||||||
|
} else if (item.additional.file.endsWith(".pdf")) {
|
||||||
|
html += render_pdf(query, [item]);
|
||||||
|
}
|
||||||
});
|
});
|
||||||
return html;
|
return html;
|
||||||
}
|
}
|
||||||
@@ -88,11 +96,25 @@
|
|||||||
} else if (type === "pdf") {
|
} else if (type === "pdf") {
|
||||||
results = render_pdf(query, data);
|
results = render_pdf(query, data);
|
||||||
} else if (type === "github" || type === "all") {
|
} else if (type === "github" || type === "all") {
|
||||||
results = render_mutliple(query, data, type);
|
results = render_multiple(query, data, type);
|
||||||
} else {
|
} else {
|
||||||
results = data.map((item) => `<div class="results-plugin">` + `<p>${item.entry}</p>` + `</div>`).join("\n")
|
results = data.map((item) => `<div class="results-plugin">` + `<p>${item.entry}</p>` + `</div>`).join("\n")
|
||||||
}
|
}
|
||||||
return `<div id="results-${type}">${results}</div>`;
|
|
||||||
|
// Any POST rendering goes here.
|
||||||
|
|
||||||
|
let renderedResults = document.createElement("div");
|
||||||
|
renderedResults.id = `results-${type}`;
|
||||||
|
renderedResults.innerHTML = results;
|
||||||
|
|
||||||
|
// For all elements that are of type img in the results html and have a src with 'avatar' in the URL, add the class 'avatar'
|
||||||
|
// This is used to make the avatar images round
|
||||||
|
let images = renderedResults.querySelectorAll("img[src*='avatar']");
|
||||||
|
for (let i = 0; i < images.length; i++) {
|
||||||
|
images[i].classList.add("avatar");
|
||||||
|
}
|
||||||
|
|
||||||
|
return renderedResults.outerHTML;
|
||||||
}
|
}
|
||||||
|
|
||||||
function search(rerank=false) {
|
function search(rerank=false) {
|
||||||
@@ -265,7 +287,6 @@
|
|||||||
margin: 0px;
|
margin: 0px;
|
||||||
background: #f8fafc;
|
background: #f8fafc;
|
||||||
color: #475569;
|
color: #475569;
|
||||||
text-align: center;
|
|
||||||
font-family: roboto, karma, segoe ui, sans-serif;
|
font-family: roboto, karma, segoe ui, sans-serif;
|
||||||
font-size: 20px;
|
font-size: 20px;
|
||||||
font-weight: 300;
|
font-weight: 300;
|
||||||
@@ -371,6 +392,28 @@
|
|||||||
max-width: 100;
|
max-width: 100;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
a {
|
||||||
|
color: #3b82f6;
|
||||||
|
text-decoration: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
img.avatar {
|
||||||
|
width: 20px;
|
||||||
|
height: 20px;
|
||||||
|
border-radius: 50%;
|
||||||
|
}
|
||||||
|
|
||||||
|
div.results-markdown,
|
||||||
|
div.results-org,
|
||||||
|
div.results-pdf {
|
||||||
|
text-align: left;
|
||||||
|
box-shadow: 2px 2px 2px var(--primary-hover);
|
||||||
|
border-radius: 5px;
|
||||||
|
padding: 10px;
|
||||||
|
margin: 10px 0;
|
||||||
|
border: 1px solid rgb(229, 229, 229);
|
||||||
|
}
|
||||||
|
|
||||||
</style>
|
</style>
|
||||||
|
|
||||||
</html>
|
</html>
|
||||||
|
|||||||
@@ -1,7 +1,8 @@
|
|||||||
# Standard Packages
|
# Standard Packages
|
||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
from typing import Dict, List
|
from datetime import datetime
|
||||||
|
from typing import Dict, List, Union
|
||||||
|
|
||||||
# External Packages
|
# External Packages
|
||||||
import requests
|
import requests
|
||||||
@@ -13,6 +14,7 @@ from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
|||||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||||
from khoj.processor.text_to_jsonl import TextToJsonl
|
from khoj.processor.text_to_jsonl import TextToJsonl
|
||||||
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
|
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||||
|
from khoj.utils.rawconfig import Entry
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -22,6 +24,8 @@ class GithubToJsonl(TextToJsonl):
|
|||||||
def __init__(self, config: GithubContentConfig):
|
def __init__(self, config: GithubContentConfig):
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
self.config = config
|
self.config = config
|
||||||
|
self.session = requests.Session()
|
||||||
|
self.session.headers.update({"Authorization": f"token {self.config.pat_token}"})
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def wait_for_rate_limit_reset(response, func, *args, **kwargs):
|
def wait_for_rate_limit_reset(response, func, *args, **kwargs):
|
||||||
@@ -53,6 +57,7 @@ class GithubToJsonl(TextToJsonl):
|
|||||||
|
|
||||||
logger.info(f"Found {len(markdown_files)} markdown files in github repo {repo_shorthand}")
|
logger.info(f"Found {len(markdown_files)} markdown files in github repo {repo_shorthand}")
|
||||||
logger.info(f"Found {len(org_files)} org files in github repo {repo_shorthand}")
|
logger.info(f"Found {len(org_files)} org files in github repo {repo_shorthand}")
|
||||||
|
current_entries = []
|
||||||
|
|
||||||
with timer(f"Extract markdown entries from github repo {repo_shorthand}", logger):
|
with timer(f"Extract markdown entries from github repo {repo_shorthand}", logger):
|
||||||
current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps(
|
current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps(
|
||||||
@@ -65,6 +70,12 @@ class GithubToJsonl(TextToJsonl):
|
|||||||
with timer(f"Extract commit messages from github repo {repo_shorthand}", logger):
|
with timer(f"Extract commit messages from github repo {repo_shorthand}", logger):
|
||||||
current_entries += self.convert_commits_to_entries(self.get_commits(repo_url), repo)
|
current_entries += self.convert_commits_to_entries(self.get_commits(repo_url), repo)
|
||||||
|
|
||||||
|
with timer(f"Extract issues from github repo {repo_shorthand}", logger):
|
||||||
|
issue_entries = GithubToJsonl.convert_issues_to_entries(
|
||||||
|
*GithubToJsonl.extract_github_issues(self.get_issues(repo_url))
|
||||||
|
)
|
||||||
|
current_entries += issue_entries
|
||||||
|
|
||||||
with timer(f"Split entries by max token size supported by model {repo_shorthand}", logger):
|
with timer(f"Split entries by max token size supported by model {repo_shorthand}", logger):
|
||||||
current_entries = TextToJsonl.split_entries_by_max_tokens(current_entries, max_tokens=256)
|
current_entries = TextToJsonl.split_entries_by_max_tokens(current_entries, max_tokens=256)
|
||||||
|
|
||||||
@@ -102,7 +113,7 @@ class GithubToJsonl(TextToJsonl):
|
|||||||
contents = response.json()
|
contents = response.json()
|
||||||
|
|
||||||
# Wait for rate limit reset if needed
|
# Wait for rate limit reset if needed
|
||||||
result = self.wait_for_rate_limit_reset(response, self.get_files)
|
result = self.wait_for_rate_limit_reset(response, self.get_files, repo_url, repo)
|
||||||
if result is not None:
|
if result is not None:
|
||||||
return result
|
return result
|
||||||
|
|
||||||
@@ -130,35 +141,43 @@ class GithubToJsonl(TextToJsonl):
|
|||||||
|
|
||||||
def get_file_contents(self, file_url):
|
def get_file_contents(self, file_url):
|
||||||
# Get text from each markdown file
|
# Get text from each markdown file
|
||||||
headers = {"Authorization": f"token {self.config.pat_token}", "Accept": "application/vnd.github.v3.raw"}
|
headers = {"Accept": "application/vnd.github.v3.raw"}
|
||||||
response = requests.get(file_url, headers=headers)
|
response = self.session.get(file_url, headers=headers, stream=True)
|
||||||
|
|
||||||
# Wait for rate limit reset if needed
|
# Wait for rate limit reset if needed
|
||||||
result = self.wait_for_rate_limit_reset(response, self.get_file_contents, file_url)
|
result = self.wait_for_rate_limit_reset(response, self.get_file_contents, file_url)
|
||||||
if result is not None:
|
if result is not None:
|
||||||
return result
|
return result
|
||||||
|
|
||||||
return response.content.decode("utf-8")
|
content = ""
|
||||||
|
for chunk in response.iter_content(chunk_size=2048):
|
||||||
|
if chunk:
|
||||||
|
content += chunk.decode("utf-8")
|
||||||
|
|
||||||
|
return content
|
||||||
|
|
||||||
def get_commits(self, repo_url: str) -> List[Dict]:
|
def get_commits(self, repo_url: str) -> List[Dict]:
|
||||||
|
return self._get_commits(f"{repo_url}/commits")
|
||||||
|
|
||||||
|
def _get_commits(self, commits_url: Union[str, None]) -> List[Dict]:
|
||||||
# Get commit messages from the repository using the Github API
|
# Get commit messages from the repository using the Github API
|
||||||
commits_url = f"{repo_url}/commits"
|
|
||||||
headers = {"Authorization": f"token {self.config.pat_token}"}
|
|
||||||
params = {"per_page": 100}
|
params = {"per_page": 100}
|
||||||
commits = []
|
commits = []
|
||||||
|
|
||||||
while commits_url is not None:
|
while commits_url is not None:
|
||||||
# Get the next page of commits
|
# Get the next page of commits
|
||||||
response = requests.get(commits_url, headers=headers, params=params)
|
response = self.session.get(commits_url, params=params, stream=True)
|
||||||
raw_commits = response.json()
|
|
||||||
|
# Read the streamed response into a JSON object
|
||||||
|
content = response.json()
|
||||||
|
|
||||||
# Wait for rate limit reset if needed
|
# Wait for rate limit reset if needed
|
||||||
result = self.wait_for_rate_limit_reset(response, self.get_commits)
|
result = self.wait_for_rate_limit_reset(response, self._get_commits, commits_url)
|
||||||
if result is not None:
|
if result is not None:
|
||||||
return result
|
return result
|
||||||
|
|
||||||
# Extract commit messages from the response
|
# Extract commit messages from the response
|
||||||
for commit in raw_commits:
|
for commit in content:
|
||||||
commits += [{"content": commit["commit"]["message"], "path": commit["html_url"]}]
|
commits += [{"content": commit["commit"]["message"], "path": commit["html_url"]}]
|
||||||
|
|
||||||
# Get the URL for the next page of commits, if any
|
# Get the URL for the next page of commits, if any
|
||||||
@@ -166,6 +185,74 @@ class GithubToJsonl(TextToJsonl):
|
|||||||
|
|
||||||
return commits
|
return commits
|
||||||
|
|
||||||
|
def get_issues(self, repo_url: str) -> List[Dict]:
|
||||||
|
return self._get_issues(f"{repo_url}/issues")
|
||||||
|
|
||||||
|
def _get_issues(self, issues_url: Union[str, None]) -> List[Dict]:
|
||||||
|
issues = []
|
||||||
|
per_page = 30
|
||||||
|
params = {"per_page": per_page, "state": "all"}
|
||||||
|
|
||||||
|
while issues_url is not None:
|
||||||
|
# Get the next page of issues
|
||||||
|
response = self.session.get(issues_url, params=params, stream=True) # type: ignore
|
||||||
|
raw_issues = response.json()
|
||||||
|
|
||||||
|
# Wait for rate limit reset if needed
|
||||||
|
result = self.wait_for_rate_limit_reset(response, self._get_issues, issues_url)
|
||||||
|
if result is not None:
|
||||||
|
return result
|
||||||
|
|
||||||
|
for issue in raw_issues:
|
||||||
|
username = issue["user"]["login"]
|
||||||
|
user_url = f"[{username}]({issue['user']['html_url']})"
|
||||||
|
issue_content = {
|
||||||
|
"content": f"## [Issue {issue['number']}]({issue['html_url']}) {issue['title']}\nby {user_url}\n\n{issue['body']}",
|
||||||
|
"path": issue["html_url"],
|
||||||
|
}
|
||||||
|
issue_content["created_at"] = {issue["created_at"]}
|
||||||
|
if issue["comments"] > 0:
|
||||||
|
issue_content["comments"] = self.get_comments(issue["comments_url"])
|
||||||
|
issues += [issue_content]
|
||||||
|
|
||||||
|
issues_url = response.links.get("next", {}).get("url")
|
||||||
|
|
||||||
|
return issues
|
||||||
|
|
||||||
|
def get_comments(self, comments_url: Union[str, None]) -> List[Dict]:
|
||||||
|
# By default, the number of results per page is 30. We'll keep it as-is for now.
|
||||||
|
comments = []
|
||||||
|
per_page = 30
|
||||||
|
params = {"per_page": per_page}
|
||||||
|
|
||||||
|
while comments_url is not None:
|
||||||
|
# Get the next page of comments
|
||||||
|
response = self.session.get(comments_url, params=params, stream=True)
|
||||||
|
raw_comments = response.json()
|
||||||
|
|
||||||
|
# Wait for rate limit reset if needed
|
||||||
|
result = self.wait_for_rate_limit_reset(response, self.get_comments, comments_url)
|
||||||
|
if result is not None:
|
||||||
|
return result
|
||||||
|
|
||||||
|
for comment in raw_comments:
|
||||||
|
created_at = datetime.strptime(comment["created_at"], "%Y-%m-%dT%H:%M:%SZ").strftime("%Y-%m-%d %H:%M")
|
||||||
|
commenter = comment["user"]["login"]
|
||||||
|
commenter_url = comment["user"]["html_url"]
|
||||||
|
comment_url = comment["html_url"]
|
||||||
|
comment_url_link = f"[{created_at}]({comment_url})"
|
||||||
|
avatar_url = comment["user"]["avatar_url"]
|
||||||
|
avatar = f""
|
||||||
|
comments += [
|
||||||
|
{
|
||||||
|
"content": f"### {avatar} [{commenter}]({commenter_url}) - ({comment_url_link})\n\n{comment['body']}"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
comments_url = response.links.get("next", {}).get("url")
|
||||||
|
|
||||||
|
return comments
|
||||||
|
|
||||||
def convert_commits_to_entries(self, commits, repo: GithubRepoConfig) -> List[Entry]:
|
def convert_commits_to_entries(self, commits, repo: GithubRepoConfig) -> List[Entry]:
|
||||||
entries: List[Entry] = []
|
entries: List[Entry] = []
|
||||||
for commit in commits:
|
for commit in commits:
|
||||||
@@ -201,3 +288,32 @@ class GithubToJsonl(TextToJsonl):
|
|||||||
doc["content"], doc["path"], entries, entry_to_file_map
|
doc["content"], doc["path"], entries, entry_to_file_map
|
||||||
)
|
)
|
||||||
return entries, dict(entry_to_file_map)
|
return entries, dict(entry_to_file_map)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def extract_github_issues(issues):
|
||||||
|
entries = []
|
||||||
|
entry_to_file_map = {}
|
||||||
|
for issue in issues:
|
||||||
|
content = issue["content"]
|
||||||
|
if "comments" in issue:
|
||||||
|
for comment in issue["comments"]:
|
||||||
|
content += "\n\n" + comment["content"]
|
||||||
|
entries.append(content)
|
||||||
|
entry_to_file_map[content] = {"path": issue["path"]}
|
||||||
|
return entries, entry_to_file_map
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def convert_issues_to_entries(parsed_entries: List[str], entry_to_metadata_map: Dict[str, Dict]) -> List[Entry]:
|
||||||
|
entries = []
|
||||||
|
for entry in parsed_entries:
|
||||||
|
entry_file_name = entry_to_metadata_map[entry]["path"]
|
||||||
|
entries.append(
|
||||||
|
Entry(
|
||||||
|
compiled=entry,
|
||||||
|
raw=entry,
|
||||||
|
heading=entry.split("\n")[0],
|
||||||
|
file=entry_file_name,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
return entries
|
||||||
|
|||||||
@@ -216,6 +216,20 @@ async def search(
|
|||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
if (t == SearchType.Github or t == SearchType.All) and state.model.github_search:
|
||||||
|
# query github issues
|
||||||
|
search_futures += [
|
||||||
|
executor.submit(
|
||||||
|
text_search.query,
|
||||||
|
user_query,
|
||||||
|
state.model.github_search,
|
||||||
|
question_embedding=encoded_asymmetric_query,
|
||||||
|
rank_results=r or False,
|
||||||
|
score_threshold=score_threshold,
|
||||||
|
dedupe=dedupe or True,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
if (t == SearchType.Pdf or t == SearchType.All) and state.model.pdf_search:
|
if (t == SearchType.Pdf or t == SearchType.All) and state.model.pdf_search:
|
||||||
# query pdf files
|
# query pdf files
|
||||||
search_futures += [
|
search_futures += [
|
||||||
|
|||||||
@@ -120,7 +120,7 @@ def md_content_config(tmp_path_factory):
|
|||||||
content_config = ContentConfig()
|
content_config = ContentConfig()
|
||||||
content_config.markdown = TextContentConfig(
|
content_config.markdown = TextContentConfig(
|
||||||
input_files=None,
|
input_files=None,
|
||||||
input_filter=["tests/data/markdown/*.md"],
|
input_filter=["tests/data/markdown/*.markdown"],
|
||||||
compressed_jsonl=content_dir.joinpath("markdown.jsonl"),
|
compressed_jsonl=content_dir.joinpath("markdown.jsonl"),
|
||||||
embeddings_file=content_dir.joinpath("markdown_embeddings.pt"),
|
embeddings_file=content_dir.joinpath("markdown_embeddings.pt"),
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user