mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-03 05:29:12 +00:00
Ignore scheduled, closed and deadline times, and logbook drawer start and end lines, in the org node body
- Gives cleaner embeddings for semantic search
- Hopefully improves results and reduces size and compute
This commit is contained in:
@@ -62,6 +62,7 @@ def makelist(filename):
|
||||
nodelist = []
|
||||
propdict = dict()
|
||||
in_properties_drawer = False
|
||||
in_logbook_drawer = False
|
||||
|
||||
for line in f:
|
||||
ctr += 1
|
||||
@@ -103,12 +104,17 @@ def makelist(filename):
|
||||
in_properties_drawer=False
|
||||
continue
|
||||
|
||||
# Ignore Clocking Lines
|
||||
if re.search(r'CLOCK: \[[0-9]{4}-[0-9]{2}-[0-9]{2}', line):
|
||||
# Ignore Logbook Drawer Start, End Lines
|
||||
if re.search(':LOGBOOK:', line):
|
||||
in_logbook_drawer=True
|
||||
continue
|
||||
if in_logbook_drawer and re.search(':END:', line):
|
||||
in_logbook_drawer=False
|
||||
continue
|
||||
|
||||
if not in_properties_drawer and line[:1] != '#':
|
||||
bodytext = bodytext + line
|
||||
# Ignore Clocking Lines
|
||||
if re.search(r'CLOCK: \[[0-9]{4}-[0-9]{2}-[0-9]{2}', line):
|
||||
line = ""
|
||||
|
||||
prop_srch = re.search(r'^\s*:([a-zA-Z0-9]+):\s*(.*?)\s*$', line)
|
||||
if prop_srch:
|
||||
@@ -118,7 +124,9 @@ def makelist(filename):
|
||||
else:
|
||||
propdict[prop_srch.group(1)] = prop_srch.group(2)
|
||||
continue
|
||||
sd_re = re.search(r'SCHEDULED:\s+<([0-9]+)\-([0-9]+)\-([0-9]+)', line)
|
||||
|
||||
cd_re = re.search(r'CLOSED:\s*\[([0-9]+)\-([0-9]+)\-([0-9]+)', line)
|
||||
sd_re = re.search(r'SCHEDULED:\s*<([0-9]+)\-([0-9]+)\-([0-9]+)', line)
|
||||
if sd_re:
|
||||
sched_date = datetime.date(int(sd_re.group(1)),
|
||||
int(sd_re.group(2)),
|
||||
@@ -129,6 +137,10 @@ def makelist(filename):
|
||||
int(dd_re.group(2)),
|
||||
int(dd_re.group(3)) )
|
||||
|
||||
# Ignore property drawer, scheduled, closed, deadline and # lines from body
|
||||
if not in_properties_drawer and not cd_re and not sd_re and not dd_re and line[:1] != '#':
|
||||
bodytext = bodytext + line
|
||||
|
||||
# write out last node
|
||||
thisNode = Orgnode(level, heading, bodytext, tag1, alltags)
|
||||
thisNode.setProperties(propdict)
|
||||
|
||||
Reference in New Issue
Block a user