From 9ca61d62ff3270e135f8aed330f937a214fd905c Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 15 May 2023 20:51:49 +0800 Subject: [PATCH 1/8] Enable/disable logging telemetry by setting bool in khoj.yml config We log usage telemetry by default, unless setting explicitly set in khoj.yml --- src/khoj/utils/rawconfig.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/khoj/utils/rawconfig.py b/src/khoj/utils/rawconfig.py index 51d64381..bc8ef78a 100644 --- a/src/khoj/utils/rawconfig.py +++ b/src/khoj/utils/rawconfig.py @@ -89,10 +89,15 @@ class ProcessorConfig(ConfigBase): conversation: Optional[ConversationProcessorConfig] +class AppConfig(ConfigBase): + should_log_telemetry: bool + + class FullConfig(ConfigBase): content_type: Optional[ContentConfig] search_type: Optional[SearchConfig] processor: Optional[ProcessorConfig] + app: Optional[AppConfig] = AppConfig(should_log_telemetry=True) class SearchResponse(ConfigBase): From f2e89f6f465a67a0d3e72705b89fcacf562b5a35 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 15 May 2023 20:57:54 +0800 Subject: [PATCH 2/8] Add khoj app helper methods to log app usage to a telemetry server --- src/khoj/utils/constants.py | 2 + src/khoj/utils/helpers.py | 82 ++++++++++++++++++++++++++++++++++--- 2 files changed, 79 insertions(+), 5 deletions(-) diff --git a/src/khoj/utils/constants.py b/src/khoj/utils/constants.py index 71722c1a..aa10a4d3 100644 --- a/src/khoj/utils/constants.py +++ b/src/khoj/utils/constants.py @@ -3,6 +3,8 @@ from pathlib import Path app_root_directory = Path(__file__).parent.parent.parent web_directory = app_root_directory / "khoj/interface/web/" empty_escape_sequences = "\n|\r|\t| " +app_env_filepath = "~/.khoj/env" +telemetry_server = "https://khoj.beta.haletic.com/v1/telemetry" # default app config to use default_config = { diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py index 521b7b65..73d25c98 100644 --- a/src/khoj/utils/helpers.py +++ b/src/khoj/utils/helpers.py @@ -1,14 +1,22 @@ # Standard Packages from __future__ import annotations # to avoid quoting type hints -import logging -import sys -import torch from collections import OrderedDict +import datetime from importlib import import_module -from os.path import join +import logging +from os import path from pathlib import Path +import platform +import requests +import sys from time import perf_counter +import torch from typing import Optional, Union, TYPE_CHECKING +import uuid + +# Internal Packages +from khoj.utils import constants + if TYPE_CHECKING: # External Packages @@ -16,6 +24,7 @@ if TYPE_CHECKING: # Internal Packages from khoj.utils.models import BaseEncoder + from khoj.utils.rawconfig import AppConfig def is_none_or_empty(item): @@ -59,7 +68,7 @@ def load_model(model_name: str, model_type, model_dir=None, device: str = None) "Load model from disk or huggingface" # Construct model path logger = logging.getLogger(__name__) - model_path = join(model_dir, model_name.replace("/", "_")) if model_dir is not None else None + model_path = path.join(model_dir, model_name.replace("/", "_")) if model_dir is not None else None # Load model from model_path if it exists there model_type_class = get_class_by_name(model_type) if isinstance(model_type, str) else model_type @@ -123,3 +132,66 @@ class LRU(OrderedDict): if len(self) > self.capacity: oldest = next(iter(self)) del self[oldest] + + +def get_server_id(): + """Get, Generate Persistent, Random ID per server install. + Helps count distinct khoj servers deployed. + Maintains anonymity by using non-PII random id.""" + # Expand path to the khoj env file. It contains persistent internal app data + app_env_filename = path.expanduser(constants.app_env_filepath) + + # Check if the file exists + if path.exists(app_env_filename): + # Read the contents of the file + with open(app_env_filename, "r") as f: + contents = f.readlines() + + # Extract the server_id from the contents + for line in contents: + key, value = line.strip().split("=") + if key.strip() == "server_id": + server_id = value.strip() + break + else: + # If server_id is not found, generate a new one + server_id = str(uuid.uuid4()) + + else: + # Generate a new server id + server_id = str(uuid.uuid4()) + + # Write the server_id to the file + with open(app_env_filename, "w") as f: + f.write("server_id=" + server_id + "\n") + + return server_id + + +def log_telemetry(telemetry_type: str, api: str = None, client: str = None, app_config: AppConfig = None): + """Log basic app usage telemetry like client, os, api called""" + # Do not log usage telemetry, if telemetry is disabled via app config + if not app_config or not app_config.should_log_telemetry: + return + + # Populate telemetry data to log + request_body = { + "telemetry_type": telemetry_type, + "server_id": get_server_id(), + "os": platform.system(), + "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + } + if api: + # API endpoint on server called by client + request_body["api"] = api + if client: + # Client from which the API was called. E.g Emacs, Obsidian + request_body["client"] = client + + # Log telemetry data to telemetry endpoint + logger = logging.getLogger(__name__) + try: + logger.debug(f"Log usage telemetry to {constants.telemetry_server}: {request_body}") + requests.post(constants.telemetry_server, json=request_body) + except: + pass From 3ede919c6668b72425b38b7a9f588e231a089662 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 15 May 2023 20:58:51 +0800 Subject: [PATCH 3/8] Log usage of /search, /chat, /update API endpoints to telemetry server --- src/khoj/routers/api.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index 6aa72b38..d92da70f 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -14,7 +14,7 @@ from khoj.configure import configure_processor, configure_search from khoj.processor.conversation.gpt import converse, extract_questions from khoj.processor.conversation.utils import message_to_log, message_to_prompt from khoj.search_type import image_search, text_search -from khoj.utils.helpers import timer +from khoj.utils.helpers import log_telemetry, timer from khoj.utils.rawconfig import FullConfig, SearchResponse from khoj.utils.state import SearchType from khoj.utils import state, constants @@ -168,6 +168,8 @@ def search( # Cache results state.query_cache[query_cache_key] = results + log_telemetry(telemetry_type="api", api="search", app_config=state.config.app) + return results @@ -191,6 +193,8 @@ def update(t: Optional[SearchType] = None, force: Optional[bool] = False): else: logger.info("📬 Processor reconfigured via API") + log_telemetry(telemetry_type="api", api="update", app_config=state.config.app) + return {"status": "ok", "message": "khoj reloaded"} @@ -251,4 +255,6 @@ def chat(q: Optional[str] = None): conversation_log=meta_log.get("chat", []), ) + log_telemetry(telemetry_type="api", api="chat", app_config=state.config.app) + return {"status": status, "response": gpt_response, "context": compiled_references} From 134cce9d3236993f8a003bac2b2a47ad8c772c39 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 16 May 2023 23:32:18 +0530 Subject: [PATCH 4/8] Batch upload telemetry data at regular interval instead of while querying --- src/khoj/configure.py | 16 ++++++++++++++++ src/khoj/routers/api.py | 6 +++--- src/khoj/utils/helpers.py | 9 ++------- src/khoj/utils/state.py | 3 ++- 4 files changed, 23 insertions(+), 11 deletions(-) diff --git a/src/khoj/configure.py b/src/khoj/configure.py index c6c9e308..e6746f31 100644 --- a/src/khoj/configure.py +++ b/src/khoj/configure.py @@ -3,6 +3,7 @@ import sys import logging import json from enum import Enum +import requests # External Packages import schedule @@ -223,3 +224,18 @@ def save_chat_session(): state.processor_config.conversation.chat_session = None logger.info("📩 Saved current chat session to conversation logs") + + +@schedule.repeat(schedule.every(1).minutes) +def upload_telemetry(): + if not state.config.app.should_log_telemetry or not state.telemetry: + print("No telemetry to upload") if not state.telemetry else print("Telemetry logging disabled") + return + + try: + logger.debug(f"📡 Upload usage telemetry to {constants.telemetry_server}: {state.telemetry}") + requests.post(constants.telemetry_server, json=state.telemetry) + except Exception as e: + logger.error(f"Error uploading telemetry: {e}") + else: + state.telemetry = None diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index d92da70f..21493557 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -168,7 +168,7 @@ def search( # Cache results state.query_cache[query_cache_key] = results - log_telemetry(telemetry_type="api", api="search", app_config=state.config.app) + state.telemetry += [log_telemetry(telemetry_type="api", api="search", app_config=state.config.app)] return results @@ -193,7 +193,7 @@ def update(t: Optional[SearchType] = None, force: Optional[bool] = False): else: logger.info("📬 Processor reconfigured via API") - log_telemetry(telemetry_type="api", api="update", app_config=state.config.app) + state.telemetry += [log_telemetry(telemetry_type="api", api="update", app_config=state.config.app)] return {"status": "ok", "message": "khoj reloaded"} @@ -255,6 +255,6 @@ def chat(q: Optional[str] = None): conversation_log=meta_log.get("chat", []), ) - log_telemetry(telemetry_type="api", api="chat", app_config=state.config.app) + state.telemetry += [log_telemetry(telemetry_type="api", api="chat", app_config=state.config.app)] return {"status": status, "response": gpt_response, "context": compiled_references} diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py index 73d25c98..e9581769 100644 --- a/src/khoj/utils/helpers.py +++ b/src/khoj/utils/helpers.py @@ -172,7 +172,7 @@ def log_telemetry(telemetry_type: str, api: str = None, client: str = None, app_ """Log basic app usage telemetry like client, os, api called""" # Do not log usage telemetry, if telemetry is disabled via app config if not app_config or not app_config.should_log_telemetry: - return + return [] # Populate telemetry data to log request_body = { @@ -189,9 +189,4 @@ def log_telemetry(telemetry_type: str, api: str = None, client: str = None, app_ request_body["client"] = client # Log telemetry data to telemetry endpoint - logger = logging.getLogger(__name__) - try: - logger.debug(f"Log usage telemetry to {constants.telemetry_server}: {request_body}") - requests.post(constants.telemetry_server, json=request_body) - except: - pass + return request_body diff --git a/src/khoj/utils/state.py b/src/khoj/utils/state.py index 7a38bfdf..9d5ed27f 100644 --- a/src/khoj/utils/state.py +++ b/src/khoj/utils/state.py @@ -1,6 +1,6 @@ # Standard Packages import threading -from typing import List +from typing import List, Dict from packaging import version # External Packages @@ -25,6 +25,7 @@ cli_args: List[str] = None query_cache = LRU() search_index_lock = threading.Lock() SearchType = utils_config.SearchType +telemetry: List[Dict[str, str]] = [] if torch.cuda.is_available(): # Use CUDA GPU From d42f0f5055de28c6fbfaeb52b6b13f4a6e81e55c Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 15 May 2023 22:55:49 +0800 Subject: [PATCH 5/8] Add basic telemetry server for khoj --- src/khoj/configure.py | 9 ++--- src/khoj/routers/api.py | 5 ++- src/khoj/utils/state.py | 1 + src/telemetry/requirements.txt | 2 ++ src/telemetry/telemetry.py | 65 ++++++++++++++++++++++++++++++++++ 5 files changed, 77 insertions(+), 5 deletions(-) create mode 100644 src/telemetry/requirements.txt create mode 100644 src/telemetry/telemetry.py diff --git a/src/khoj/configure.py b/src/khoj/configure.py index e6746f31..bca90cb4 100644 --- a/src/khoj/configure.py +++ b/src/khoj/configure.py @@ -229,13 +229,14 @@ def save_chat_session(): @schedule.repeat(schedule.every(1).minutes) def upload_telemetry(): if not state.config.app.should_log_telemetry or not state.telemetry: - print("No telemetry to upload") if not state.telemetry else print("Telemetry logging disabled") + message = "📡 No telemetry to upload" if not state.telemetry else "📡 Telemetry logging disabled" + logger.debug(message) return try: - logger.debug(f"📡 Upload usage telemetry to {constants.telemetry_server}: {state.telemetry}") + logger.debug(f"📡 Upload usage telemetry to {constants.telemetry_server}:\n{state.telemetry}") requests.post(constants.telemetry_server, json=state.telemetry) except Exception as e: - logger.error(f"Error uploading telemetry: {e}") + logger.error(f"📡 Error uploading telemetry: {e}") else: - state.telemetry = None + state.telemetry = [] diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index 21493557..0c7f278f 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -168,7 +168,10 @@ def search( # Cache results state.query_cache[query_cache_key] = results - state.telemetry += [log_telemetry(telemetry_type="api", api="search", app_config=state.config.app)] + # Only log telemetry if query is new and not a continuation of previous query + if state.previous_query is None or state.previous_query not in user_query: + state.telemetry += [log_telemetry(telemetry_type="api", api="search", app_config=state.config.app)] + state.previous_query = user_query return results diff --git a/src/khoj/utils/state.py b/src/khoj/utils/state.py index 9d5ed27f..a3368084 100644 --- a/src/khoj/utils/state.py +++ b/src/khoj/utils/state.py @@ -26,6 +26,7 @@ query_cache = LRU() search_index_lock = threading.Lock() SearchType = utils_config.SearchType telemetry: List[Dict[str, str]] = [] +previous_query: str = None if torch.cuda.is_available(): # Use CUDA GPU diff --git a/src/telemetry/requirements.txt b/src/telemetry/requirements.txt new file mode 100644 index 00000000..405599c4 --- /dev/null +++ b/src/telemetry/requirements.txt @@ -0,0 +1,2 @@ +uvicorn +fastapi diff --git a/src/telemetry/telemetry.py b/src/telemetry/telemetry.py new file mode 100644 index 00000000..2a96b237 --- /dev/null +++ b/src/telemetry/telemetry.py @@ -0,0 +1,65 @@ +# Standard Packages +import argparse +import logging +from typing import Dict, List + +# External Packages +from fastapi import FastAPI +from fastapi import HTTPException +import sqlite3 +import uvicorn + + +# Initialize Global App Variables +app = FastAPI() +sqlfile = "khoj.sqlite" +logger = logging.getLogger() +logger.setLevel(logging.DEBUG) + + +@app.post("/v1/telemetry") +def v1_telemetry(telemetry_data: List[Dict[str, str]]): + # Throw exception if no telemetry data received in POST request body + if len(telemetry_data) == 0: + error_message = "Post body is empty. It should contain some telemetry data" + logger.error(error_message) + raise HTTPException(status_code=500, detail=error_message) + + # Insert recieved telemetry data into SQLite db + logger.info(f"Insert row into telemetry table: {telemetry_data}") + with sqlite3.connect(sqlfile) as conn: + cur = conn.cursor() + + # Create a table if it doesn't exist + cur.execute( + """CREATE TABLE IF NOT EXISTS usage (id INTEGER PRIMARY KEY, time TIMESTAMP, type TEXT, server_id TEXT, os TEXT, api TEXT, client TEXT)""" + ) + + # Log telemetry data + for item in telemetry_data: + cur.execute( + "INSERT INTO usage (time, type, server_id, os, api, client) VALUES (?, ?, ?, ?, ?, ?)", + ( + item["timestamp"], + item["telemetry_type"], + item["server_id"], + item["os"], + item.get("api"), + item.get("client"), + ), + ) + # Commit the changes + conn.commit() + + return {"status": "ok", "message": "Logged usage telemetry"} + + +if __name__ == "__main__": + # Setup Argument Parser + parser = argparse.ArgumentParser(description="Start Khoj Telemetry Server") + parser.add_argument("--host", default="127.0.0.1", type=str, help="I.P of telemetry server") + parser.add_argument("--port", "-p", default=80, type=int, help="Port of telemetry server") + args = parser.parse_args() + + # Start Application Server + uvicorn.run(app, host=args.host, port=args.port, log_level="debug") From 07b19964d4b68c7c1a487067395161aa8233cc1c Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 17 May 2023 09:45:26 +0530 Subject: [PATCH 6/8] Schedule jobs at (co-)prime intervals to reduce overlap in job runs --- src/khoj/configure.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/khoj/configure.py b/src/khoj/configure.py index bca90cb4..448c8bde 100644 --- a/src/khoj/configure.py +++ b/src/khoj/configure.py @@ -63,7 +63,7 @@ def configure_routes(app): app.include_router(web_client) -@schedule.repeat(schedule.every(1).hour) +@schedule.repeat(schedule.every(61).minutes) def update_search_index(): state.search_index_lock.acquire() state.model = configure_search(state.model, state.config, regenerate=False) @@ -190,7 +190,7 @@ def configure_conversation_processor(conversation_processor_config): return conversation_processor -@schedule.repeat(schedule.every(15).minutes) +@schedule.repeat(schedule.every(17).minutes) def save_chat_session(): # No need to create empty log file if not ( @@ -226,7 +226,7 @@ def save_chat_session(): logger.info("📩 Saved current chat session to conversation logs") -@schedule.repeat(schedule.every(1).minutes) +@schedule.repeat(schedule.every(59).minutes) def upload_telemetry(): if not state.config.app.should_log_telemetry or not state.telemetry: message = "📡 No telemetry to upload" if not state.telemetry else "📡 Telemetry logging disabled" From e9f04dc644cc27add596ff779b62c89a90ea8074 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 17 May 2023 11:46:19 +0530 Subject: [PATCH 7/8] Add dockerfile to containerize telemetry server --- src/telemetry/Dockerfile | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 src/telemetry/Dockerfile diff --git a/src/telemetry/Dockerfile b/src/telemetry/Dockerfile new file mode 100644 index 00000000..049bd8e5 --- /dev/null +++ b/src/telemetry/Dockerfile @@ -0,0 +1,10 @@ +# Get Base Image +FROM tiangolo/uvicorn-gunicorn:python3.11-slim +LABEL org.opencontainers.image.source https://github.com/debanjum/khoj + +# Install Telemetry Server Dependencies +COPY requirements.txt /tmp/requirements.txt +RUN pip install --no-cache-dir -r /tmp/requirements.txt + +# Copy Application +COPY telemetry.py /app/main.py From 55d72231b3356bff440a9c25fff552660d766e51 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 17 May 2023 11:46:34 +0530 Subject: [PATCH 8/8] Generate docker image for telemetry server using Github workflow --- .../workflows/dockerize_telemetry_server.yml | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 .github/workflows/dockerize_telemetry_server.yml diff --git a/.github/workflows/dockerize_telemetry_server.yml b/.github/workflows/dockerize_telemetry_server.yml new file mode 100644 index 00000000..ae50db4b --- /dev/null +++ b/.github/workflows/dockerize_telemetry_server.yml @@ -0,0 +1,45 @@ +name: dockerize telemetry server + +on: + push: + branches: + - master + paths: + - src/telemetry/** + - .github/workflows/dockerize_telemetry_server.yml + pull_request: + branches: + - master + paths: + - src/telemetry/** + - .github/workflows/dockerize_telemetry_server.yml + workflow_dispatch: + +env: + DOCKER_IMAGE_TAG: ${{ github.ref == 'refs/heads/master' && 'latest' || github.event.pull_request.number }} + +jobs: + build: + name: Build Docker Image, Push to Container Registry + runs-on: ubuntu-latest + steps: + - name: Checkout Code + uses: actions/checkout@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: Login to GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.PAT }} + + - name: 📦 Build and Push Docker Image + uses: docker/build-push-action@v2 + with: + context: src/telemetry + file: src/telemetry/Dockerfile + push: true + tags: ghcr.io/${{ github.repository }}-telemetry:${{ env.DOCKER_IMAGE_TAG }}