Merge pull request #253 from khoj-ai/features/github-issues-indexing

Support indexing GitHub issues and their corresponding comments
2026-03-07 21:29:13 +00:00 · 2023-06-29 16:02:47 -07:00
parent 51dfa48e2b b2dd946c6d
commit 6c10d68262
24 changed files with 206 additions and 33 deletions
--- a/config/khoj_docker.yml
+++ b/config/khoj_docker.yml
@@ -11,7 +11,7 @@ content-type:
  markdown:
    input-files: null
-    input-filter: ["/data/markdown/**/*.md"]
+    input-filter: ["/data/markdown/**/*.markdown"]
    compressed-jsonl: "/data/embeddings/markdown.jsonl.gz"
    embeddings-file: "/data/embeddings/markdown_embeddings.pt"
--- a/src/khoj/interface/web/index.html
+++ b/src/khoj/interface/web/index.html
@@ -24,23 +24,26 @@
        }
        function render_org(query, data, classPrefix="") {
-            var orgCode = data.map(function (item) {
+            return data.map(function (item) {
-                return `${item.entry}`
+                var orgParser = new Org.Parser();
-            }).join("\n")
+                var orgDocument = orgParser.parse(item.entry);
-            var orgParser = new Org.Parser();
+                var orgHTMLDocument = orgDocument.convert(Org.ConverterHTML, { htmlClassPrefix: classPrefix });
-            var orgDocument = orgParser.parse(orgCode);
+                return `<div class="results-org">` + orgHTMLDocument.toString() + `</div>`;
-            var orgHTMLDocument = orgDocument.convert(Org.ConverterHTML, { htmlClassPrefix: classPrefix });
+            }).join("\n");
            return `<div class="results-org">` + orgHTMLDocument.toString() + `</div>`;
        }
        function render_markdown(query, data) {
            var md = window.markdownit();
            return data.map(function (item) {
                let rendered = "";
                if (item.additional.file.startsWith("http")) {
                    lines = item.entry.split("\n");
-                    return md.render(`${lines[0]}\t[*](${item.additional.file})\n${lines.slice(1).join("\n")}`);
+                    rendered = md.render(`${lines[0]}\t[*](${item.additional.file})\n${lines.slice(1).join("\n")}`);
                }
-                return `<div class="results-markdown">` + md.render(`${item.entry}`) + `</div>`;
+                else {
                    rendered = md.render(`${item.entry}`);
                }
                return `<div class="results-markdown">` + rendered + `</div>`;
            }).join("\n");
        }
@@ -59,16 +62,21 @@
            }).join("\n");
        }
-        function render_mutliple(query, data, type) {
+        function render_multiple(query, data, type) {
            let html = "";
            data.forEach(item => {
-               if (item.additional.file.endsWith(".org")) {
+                if (item.additional.file.endsWith(".org")) {
-                html += render_org(query, [item], "org-");
+                    html += render_org(query, [item], "org-");
-               } else if (item.additional.file.endsWith(".md")) {
+                } else if (
-                html += render_markdown(query, [item]);
+                    item.additional.file.endsWith(".md") ||
-               } else if (item.additional.file.endsWith(".pdf")) {
+                    item.additional.file.endsWith(".markdown") ||
-                html += render_pdf(query, [item]);
+                        (item.additional.file.includes("issues") && item.additional.file.includes("github.com"))
-               }
+                    )
                {
                    html += render_markdown(query, [item]);
                } else if (item.additional.file.endsWith(".pdf")) {
                    html += render_pdf(query, [item]);
                }
            });
            return html;
        }
@@ -88,11 +96,25 @@
            } else if (type === "pdf") {
                results = render_pdf(query, data);
            } else if (type === "github" || type === "all") {
-                results = render_mutliple(query, data, type);
+                results = render_multiple(query, data, type);
            } else {
                results = data.map((item) => `<div class="results-plugin">` + `<p>${item.entry}</p>` + `</div>`).join("\n")
            }
-            return `<div id="results-${type}">${results}</div>`;
+
            // Any post-render processing of the results HTML goes here.
            let renderedResults = document.createElement("div");
            renderedResults.id = `results-${type}`;
            renderedResults.innerHTML = results;
            // For all elements that are of type img in the results html and have a src with 'avatar' in the URL, add the class 'avatar'
            // This is used to make the avatar images round
            let images = renderedResults.querySelectorAll("img[src*='avatar']");
            for (let i = 0; i < images.length; i++) {
                images[i].classList.add("avatar");
            }
            return renderedResults.outerHTML;
        }
        function search(rerank=false) {
@@ -265,7 +287,6 @@
            margin: 0px;
            background: #f8fafc;
            color: #475569;
            text-align: center;
            font-family: roboto, karma, segoe ui, sans-serif;
            font-size: 20px;
            font-weight: 300;
@@ -371,6 +392,28 @@
            max-width: 100;
        }
        a {
            color: #3b82f6;
            text-decoration: none;
        }
        img.avatar {
            width: 20px;
            height: 20px;
            border-radius: 50%;
        }
        div.results-markdown,
        div.results-org,
        div.results-pdf {
            text-align: left;
            box-shadow: 2px 2px 2px var(--primary-hover);
            border-radius: 5px;
            padding: 10px;
            margin: 10px 0;
            border: 1px solid rgb(229, 229, 229);
        }
    </style>
 </html>
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@@ -1,7 +1,8 @@
 # Standard Packages
 import logging
 import time
-from typing import Dict, List
+from datetime import datetime
 from typing import Dict, List, Union
 # External Packages
 import requests
@@ -13,6 +14,7 @@ from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
 from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
 from khoj.processor.text_to_jsonl import TextToJsonl
 from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
 from khoj.utils.rawconfig import Entry
 logger = logging.getLogger(__name__)
@@ -22,6 +24,8 @@ class GithubToJsonl(TextToJsonl):
    def __init__(self, config: GithubContentConfig):
        """Keep the Github content config and open a session that carries the access token.

        Args:
            config: Github content configuration. Its ``pat_token`` is placed in the
                ``Authorization`` header of ``self.session``.
        """
        super().__init__(config)
        self.config = config

        # A single session is stored on the instance; the token header is set once here
        self.session = requests.Session()
        auth_header = {"Authorization": f"token {self.config.pat_token}"}
        self.session.headers.update(auth_header)
    @staticmethod
    def wait_for_rate_limit_reset(response, func, *args, **kwargs):
@@ -53,6 +57,7 @@ class GithubToJsonl(TextToJsonl):
        logger.info(f"Found {len(markdown_files)} markdown files in github repo {repo_shorthand}")
        logger.info(f"Found {len(org_files)} org files in github repo {repo_shorthand}")
        current_entries = []
        with timer(f"Extract markdown entries from github repo {repo_shorthand}", logger):
            current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps(
@@ -65,6 +70,12 @@ class GithubToJsonl(TextToJsonl):
        with timer(f"Extract commit messages from github repo {repo_shorthand}", logger):
            current_entries += self.convert_commits_to_entries(self.get_commits(repo_url), repo)
        with timer(f"Extract issues from github repo {repo_shorthand}", logger):
            issue_entries = GithubToJsonl.convert_issues_to_entries(
                *GithubToJsonl.extract_github_issues(self.get_issues(repo_url))
            )
            current_entries += issue_entries
        with timer(f"Split entries by max token size supported by model {repo_shorthand}", logger):
            current_entries = TextToJsonl.split_entries_by_max_tokens(current_entries, max_tokens=256)
@@ -102,7 +113,7 @@ class GithubToJsonl(TextToJsonl):
        contents = response.json()
        # Wait for rate limit reset if needed
-        result = self.wait_for_rate_limit_reset(response, self.get_files)
+        result = self.wait_for_rate_limit_reset(response, self.get_files, repo_url, repo)
        if result is not None:
            return result
@@ -130,35 +141,43 @@ class GithubToJsonl(TextToJsonl):
    def get_file_contents(self, file_url):
        # Get text from each markdown file
-        headers = {"Authorization": f"token {self.config.pat_token}", "Accept": "application/vnd.github.v3.raw"}
+        headers = {"Accept": "application/vnd.github.v3.raw"}
-        response = requests.get(file_url, headers=headers)
+        response = self.session.get(file_url, headers=headers, stream=True)
        # Wait for rate limit reset if needed
        result = self.wait_for_rate_limit_reset(response, self.get_file_contents, file_url)
        if result is not None:
            return result
-        return response.content.decode("utf-8")
+        content = ""
        for chunk in response.iter_content(chunk_size=2048):
            if chunk:
                content += chunk.decode("utf-8")
        return content
    def get_commits(self, repo_url: str) -> List[Dict]:
        """Return the commits of the repository at ``repo_url``.

        Builds the ``/commits`` endpoint from ``repo_url`` and delegates to ``_get_commits``.
        """
        commits_endpoint = f"{repo_url}/commits"
        return self._get_commits(commits_endpoint)
    def _get_commits(self, commits_url: Union[str, None]) -> List[Dict]:
        # Get commit messages from the repository using the Github API
        commits_url = f"{repo_url}/commits"
        headers = {"Authorization": f"token {self.config.pat_token}"}
        params = {"per_page": 100}
        commits = []
        while commits_url is not None:
            # Get the next page of commits
-            response = requests.get(commits_url, headers=headers, params=params)
+            response = self.session.get(commits_url, params=params, stream=True)
-            raw_commits = response.json()
+
            # Read the streamed response into a JSON object
            content = response.json()
            # Wait for rate limit reset if needed
-            result = self.wait_for_rate_limit_reset(response, self.get_commits)
+            result = self.wait_for_rate_limit_reset(response, self._get_commits, commits_url)
            if result is not None:
                return result
            # Extract commit messages from the response
-            for commit in raw_commits:
+            for commit in content:
                commits += [{"content": commit["commit"]["message"], "path": commit["html_url"]}]
            # Get the URL for the next page of commits, if any
@@ -166,6 +185,74 @@ class GithubToJsonl(TextToJsonl):
        return commits
    def get_issues(self, repo_url: str) -> List[Dict]:
        """Return the issues of the repository at ``repo_url``.

        Builds the ``/issues`` endpoint from ``repo_url`` and delegates to ``_get_issues``.
        """
        issues_endpoint = f"{repo_url}/issues"
        return self._get_issues(issues_endpoint)
    def _get_issues(self, issues_url: Union[str, None]) -> List[Dict]:
        issues = []
        per_page = 30
        params = {"per_page": per_page, "state": "all"}
        while issues_url is not None:
            # Get the next page of issues
            response = self.session.get(issues_url, params=params, stream=True)  # type: ignore
            raw_issues = response.json()
            # Wait for rate limit reset if needed
            result = self.wait_for_rate_limit_reset(response, self._get_issues, issues_url)
            if result is not None:
                return result
            for issue in raw_issues:
                username = issue["user"]["login"]
                user_url = f"[{username}]({issue['user']['html_url']})"
                issue_content = {
                    "content": f"## [Issue {issue['number']}]({issue['html_url']}) {issue['title']}\nby {user_url}\n\n{issue['body']}",
                    "path": issue["html_url"],
                }
                issue_content["created_at"] = {issue["created_at"]}
                if issue["comments"] > 0:
                    issue_content["comments"] = self.get_comments(issue["comments_url"])
                issues += [issue_content]
            issues_url = response.links.get("next", {}).get("url")
        return issues
    def get_comments(self, comments_url: Union[str, None]) -> List[Dict]:
        """Collect every comment reachable from ``comments_url``, following pagination links.

        Args:
            comments_url: URL of the first page of comments to request. ``None`` returns an empty list.

        Returns:
            One dict per comment with a single ``content`` key: a markdown heading made of the
            commenter's avatar image, a link to the commenter and a link to the comment labelled
            with its creation time (``YYYY-MM-DD HH:MM``), followed by the comment body.
        """
        # By default, the number of results per page is 30. We'll keep it as-is for now.
        comments: List[Dict] = []
        per_page = 30
        params = {"per_page": per_page}

        while comments_url is not None:
            # Get the next page of comments
            response = self.session.get(comments_url, params=params, stream=True)
            raw_comments = response.json()

            # Wait for rate limit reset if needed
            result = self.wait_for_rate_limit_reset(response, self.get_comments, comments_url)
            if result is not None:
                # NOTE(review): result is presumably the retried fetch starting at the current
                # page (confirm against wait_for_rate_limit_reset). Keep the comments already
                # gathered from earlier pages instead of discarding them.
                return comments + result

            for comment in raw_comments:
                # Reformat the timestamp from "%Y-%m-%dT%H:%M:%SZ" to a shorter link label
                created_at = datetime.strptime(comment["created_at"], "%Y-%m-%dT%H:%M:%SZ").strftime("%Y-%m-%d %H:%M")
                commenter = comment["user"]["login"]
                commenter_url = comment["user"]["html_url"]
                comment_url = comment["html_url"]
                comment_url_link = f"[{created_at}]({comment_url})"
                avatar_url = comment["user"]["avatar_url"]
                avatar = f"![{commenter}]({avatar_url})"
                comments.append(
                    {
                        "content": f"### {avatar} [{commenter}]({commenter_url}) - ({comment_url_link})\n\n{comment['body']}"
                    }
                )

            # Follow the "next" pagination link, if the response advertises one
            comments_url = response.links.get("next", {}).get("url")

        return comments
    def convert_commits_to_entries(self, commits, repo: GithubRepoConfig) -> List[Entry]:
        entries: List[Entry] = []
        for commit in commits:
@@ -201,3 +288,32 @@ class GithubToJsonl(TextToJsonl):
                doc["content"], doc["path"], entries, entry_to_file_map
            )
        return entries, dict(entry_to_file_map)
    @staticmethod
    def extract_github_issues(issues):
        entries = []
        entry_to_file_map = {}
        for issue in issues:
            content = issue["content"]
            if "comments" in issue:
                for comment in issue["comments"]:
                    content += "\n\n" + comment["content"]
            entries.append(content)
            entry_to_file_map[content] = {"path": issue["path"]}
        return entries, entry_to_file_map
    @staticmethod
    def convert_issues_to_entries(parsed_entries: List[str], entry_to_metadata_map: Dict[str, Dict]) -> List[Entry]:
        """Wrap each issue text in an ``Entry``.

        Args:
            parsed_entries: Issue texts to convert.
            entry_to_metadata_map: Maps each issue text to a dict holding its ``path``.

        Returns:
            One ``Entry`` per text, with the text as both ``compiled`` and ``raw``, its first
            line as ``heading`` and the mapped ``path`` as ``file``.
        """

        def _to_entry(issue_text):
            # Look up the source path first, then build the entry from the text itself
            source_path = entry_to_metadata_map[issue_text]["path"]
            first_line = issue_text.split("\n")[0]
            return Entry(compiled=issue_text, raw=issue_text, heading=first_line, file=source_path)

        return [_to_entry(issue_text) for issue_text in parsed_entries]
--- a/src/khoj/routers/api.py
+++ b/src/khoj/routers/api.py
@@ -216,6 +216,20 @@ async def search(
                )
            ]
        if (t == SearchType.Github or t == SearchType.All) and state.model.github_search:
            # query github issues
            search_futures += [
                executor.submit(
                    text_search.query,
                    user_query,
                    state.model.github_search,
                    question_embedding=encoded_asymmetric_query,
                    rank_results=r or False,
                    score_threshold=score_threshold,
                    dedupe=dedupe or True,
                )
            ]
        if (t == SearchType.Pdf or t == SearchType.All) and state.model.pdf_search:
            # query pdf files
            search_futures += [
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -120,7 +120,7 @@ def md_content_config(tmp_path_factory):
    content_config = ContentConfig()
    content_config.markdown = TextContentConfig(
        input_files=None,
-        input_filter=["tests/data/markdown/*.md"],
+        input_filter=["tests/data/markdown/*.markdown"],
        compressed_jsonl=content_dir.joinpath("markdown.jsonl"),
        embeddings_file=content_dir.joinpath("markdown_embeddings.pt"),
    )
--- a/tests/data/markdown/Birthday
+++ b/tests/data/markdown/Birthday
--- a/tests/data/markdown/Hike
+++ b/tests/data/markdown/Hike
--- a/tests/data/markdown/Meet
+++ b/tests/data/markdown/Meet
--- a/tests/data/markdown/Miscellaneous
+++ b/tests/data/markdown/Miscellaneous
--- a/tests/data/markdown/Namita.markdown
+++ b/tests/data/markdown/Namita.markdown
--- a/tests/data/markdown/Patent
+++ b/tests/data/markdown/Patent
--- a/tests/data/markdown/Preparing
+++ b/tests/data/markdown/Preparing
--- a/tests/data/markdown/Sign
+++ b/tests/data/markdown/Sign
--- a/tests/data/markdown/Submit
+++ b/tests/data/markdown/Submit
--- a/tests/data/markdown/Visit
+++ b/tests/data/markdown/Visit
--- a/tests/data/markdown/Xi
+++ b/tests/data/markdown/Xi
--- a/tests/data/markdown/copy_what_you_like.markdown
+++ b/tests/data/markdown/copy_what_you_like.markdown
--- a/tests/data/markdown/having_kids.markdown
+++ b/tests/data/markdown/having_kids.markdown
--- a/tests/data/markdown/how_y_combinator_started.markdown
+++ b/tests/data/markdown/how_y_combinator_started.markdown
--- a/tests/data/markdown/jessica_livingston.markdown
+++ b/tests/data/markdown/jessica_livingston.markdown
--- a/tests/data/markdown/undergraduation.markdown
+++ b/tests/data/markdown/undergraduation.markdown
--- a/tests/data/markdown/what_i_did_this_summer.markdown
+++ b/tests/data/markdown/what_i_did_this_summer.markdown
--- a/tests/data/markdown/what_i_worked_on.markdown
+++ b/tests/data/markdown/what_i_worked_on.markdown
--- a/tests/data/markdown/why_yc.markdown
+++ b/tests/data/markdown/why_yc.markdown