mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-03 21:29:08 +00:00
Use List, Tuple, Set from typing to support Python 3.8 for khoj
Before Python 3.9, the builtin list, tuple, set, etc. cannot be subscripted in type hints (e.g. list[str]); the typing equivalents (List, Tuple, Set) must be used instead. Resolves #130
This commit is contained in:
@@ -2,7 +2,7 @@
|
||||
import glob
|
||||
import re
|
||||
import logging
|
||||
import time
|
||||
from typing import List
|
||||
|
||||
# Internal Packages
|
||||
from src.processor.text_to_jsonl import TextToJsonl
|
||||
@@ -109,7 +109,7 @@ class BeancountToJsonl(TextToJsonl):
|
||||
return entries, dict(transaction_to_file_map)
|
||||
|
||||
@staticmethod
|
||||
def convert_transactions_to_maps(parsed_entries: list[str], transaction_to_file_map) -> list[Entry]:
|
||||
def convert_transactions_to_maps(parsed_entries: List[str], transaction_to_file_map) -> List[Entry]:
|
||||
"Convert each parsed Beancount transaction into a Entry"
|
||||
entries = []
|
||||
for parsed_entry in parsed_entries:
|
||||
@@ -120,6 +120,6 @@ class BeancountToJsonl(TextToJsonl):
|
||||
return entries
|
||||
|
||||
@staticmethod
|
||||
def convert_transaction_maps_to_jsonl(entries: list[Entry]) -> str:
|
||||
def convert_transaction_maps_to_jsonl(entries: List[Entry]) -> str:
|
||||
"Convert each Beancount transaction entry to JSON and collate as JSONL"
|
||||
return ''.join([f'{entry.to_json()}\n' for entry in entries])
|
||||
|
||||
@@ -3,6 +3,7 @@ import glob
|
||||
import re
|
||||
import logging
|
||||
import time
|
||||
from typing import List
|
||||
|
||||
# Internal Packages
|
||||
from src.processor.text_to_jsonl import TextToJsonl
|
||||
@@ -110,7 +111,7 @@ class MarkdownToJsonl(TextToJsonl):
|
||||
return entries, dict(entry_to_file_map)
|
||||
|
||||
@staticmethod
|
||||
def convert_markdown_entries_to_maps(parsed_entries: list[str], entry_to_file_map) -> list[Entry]:
|
||||
def convert_markdown_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]:
|
||||
"Convert each Markdown entries into a dictionary"
|
||||
entries = []
|
||||
for parsed_entry in parsed_entries:
|
||||
@@ -121,6 +122,6 @@ class MarkdownToJsonl(TextToJsonl):
|
||||
return entries
|
||||
|
||||
@staticmethod
|
||||
def convert_markdown_maps_to_jsonl(entries: list[Entry]):
|
||||
def convert_markdown_maps_to_jsonl(entries: List[Entry]):
|
||||
"Convert each Markdown entry to JSON and collate as JSONL"
|
||||
return ''.join([f'{entry.to_json()}\n' for entry in entries])
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
import glob
|
||||
import logging
|
||||
import time
|
||||
from typing import Iterable
|
||||
from typing import Iterable, List
|
||||
|
||||
# Internal Packages
|
||||
from src.processor.org_mode import orgnode
|
||||
@@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
class OrgToJsonl(TextToJsonl):
|
||||
# Define Functions
|
||||
def process(self, previous_entries: list[Entry]=None):
|
||||
def process(self, previous_entries: List[Entry]=None):
|
||||
# Extract required fields from config
|
||||
org_files, org_file_filter, output_file = self.config.input_files, self.config.input_filter, self.config.compressed_jsonl
|
||||
index_heading_entries = self.config.index_heading_entries
|
||||
@@ -101,9 +101,9 @@ class OrgToJsonl(TextToJsonl):
|
||||
return entries, dict(entry_to_file_map)
|
||||
|
||||
@staticmethod
|
||||
def convert_org_nodes_to_entries(parsed_entries: list[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False) -> list[Entry]:
|
||||
def convert_org_nodes_to_entries(parsed_entries: List[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False) -> List[Entry]:
|
||||
"Convert Org-Mode nodes into list of Entry objects"
|
||||
entries: list[Entry] = []
|
||||
entries: List[Entry] = []
|
||||
for parsed_entry in parsed_entries:
|
||||
if not parsed_entry.hasBody and not index_heading_entries:
|
||||
# Ignore title notes i.e notes with just headings and empty body
|
||||
|
||||
@@ -37,6 +37,7 @@ import re
|
||||
import datetime
|
||||
from pathlib import Path
|
||||
from os.path import relpath
|
||||
from typing import List
|
||||
|
||||
indent_regex = re.compile(r'^ *')
|
||||
|
||||
@@ -69,7 +70,7 @@ def makelist(filename):
|
||||
sched_date = ''
|
||||
deadline_date = ''
|
||||
logbook = list()
|
||||
nodelist: list[Orgnode] = list()
|
||||
nodelist: List[Orgnode] = list()
|
||||
property_map = dict()
|
||||
in_properties_drawer = False
|
||||
in_logbook_drawer = False
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
# Standard Packages
|
||||
from abc import ABC, abstractmethod
|
||||
import hashlib
|
||||
import time
|
||||
import logging
|
||||
from typing import Callable
|
||||
from typing import Callable, List, Tuple
|
||||
from src.utils.helpers import timer
|
||||
|
||||
# Internal Packages
|
||||
@@ -18,16 +17,16 @@ class TextToJsonl(ABC):
|
||||
self.config = config
|
||||
|
||||
@abstractmethod
|
||||
def process(self, previous_entries: list[Entry]=None) -> list[tuple[int, Entry]]: ...
|
||||
def process(self, previous_entries: List[Entry]=None) -> List[Tuple[int, Entry]]: ...
|
||||
|
||||
@staticmethod
|
||||
def hash_func(key: str) -> Callable:
|
||||
return lambda entry: hashlib.md5(bytes(getattr(entry, key), encoding='utf-8')).hexdigest()
|
||||
|
||||
@staticmethod
|
||||
def split_entries_by_max_tokens(entries: list[Entry], max_tokens: int=256, max_word_length: int=500) -> list[Entry]:
|
||||
def split_entries_by_max_tokens(entries: List[Entry], max_tokens: int=256, max_word_length: int=500) -> List[Entry]:
|
||||
"Split entries if compiled entry length exceeds the max tokens supported by the ML model."
|
||||
chunked_entries: list[Entry] = []
|
||||
chunked_entries: List[Entry] = []
|
||||
for entry in entries:
|
||||
compiled_entry_words = entry.compiled.split()
|
||||
# Drop long words instead of having entry truncated to maintain quality of entry processed by models
|
||||
@@ -39,7 +38,7 @@ class TextToJsonl(ABC):
|
||||
chunked_entries.append(entry_chunk)
|
||||
return chunked_entries
|
||||
|
||||
def mark_entries_for_update(self, current_entries: list[Entry], previous_entries: list[Entry], key='compiled', logger=None) -> list[tuple[int, Entry]]:
|
||||
def mark_entries_for_update(self, current_entries: List[Entry], previous_entries: List[Entry], key='compiled', logger=None) -> List[Tuple[int, Entry]]:
|
||||
# Hash all current and previous entries to identify new entries
|
||||
with timer("Hash previous, current entries", logger):
|
||||
current_entry_hashes = list(map(TextToJsonl.hash_func(key), current_entries))
|
||||
|
||||
Reference in New Issue
Block a user