mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-07 21:29:13 +00:00
Ignore scheduled, closed and deadline times, and logbook drawer start and end lines, in org node body
- Gives cleaner embeddings for semantic search
- Hopefully improves results and reduces size and compute
This commit is contained in:
@@ -62,6 +62,7 @@ def makelist(filename):
|
|||||||
nodelist = []
|
nodelist = []
|
||||||
propdict = dict()
|
propdict = dict()
|
||||||
in_properties_drawer = False
|
in_properties_drawer = False
|
||||||
|
in_logbook_drawer = False
|
||||||
|
|
||||||
for line in f:
|
for line in f:
|
||||||
ctr += 1
|
ctr += 1
|
||||||
@@ -103,12 +104,17 @@ def makelist(filename):
|
|||||||
in_properties_drawer=False
|
in_properties_drawer=False
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Ignore Clocking Lines
|
# Ignore Logbook Drawer Start, End Lines
|
||||||
if re.search(r'CLOCK: \[[0-9]{4}-[0-9]{2}-[0-9]{2}', line):
|
if re.search(':LOGBOOK:', line):
|
||||||
|
in_logbook_drawer=True
|
||||||
|
continue
|
||||||
|
if in_logbook_drawer and re.search(':END:', line):
|
||||||
|
in_logbook_drawer=False
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if not in_properties_drawer and line[:1] != '#':
|
# Ignore Clocking Lines
|
||||||
bodytext = bodytext + line
|
if re.search(r'CLOCK: \[[0-9]{4}-[0-9]{2}-[0-9]{2}', line):
|
||||||
|
line = ""
|
||||||
|
|
||||||
prop_srch = re.search(r'^\s*:([a-zA-Z0-9]+):\s*(.*?)\s*$', line)
|
prop_srch = re.search(r'^\s*:([a-zA-Z0-9]+):\s*(.*?)\s*$', line)
|
||||||
if prop_srch:
|
if prop_srch:
|
||||||
@@ -118,7 +124,9 @@ def makelist(filename):
|
|||||||
else:
|
else:
|
||||||
propdict[prop_srch.group(1)] = prop_srch.group(2)
|
propdict[prop_srch.group(1)] = prop_srch.group(2)
|
||||||
continue
|
continue
|
||||||
sd_re = re.search(r'SCHEDULED:\s+<([0-9]+)\-([0-9]+)\-([0-9]+)', line)
|
|
||||||
|
cd_re = re.search(r'CLOSED:\s*\[([0-9]+)\-([0-9]+)\-([0-9]+)', line)
|
||||||
|
sd_re = re.search(r'SCHEDULED:\s*<([0-9]+)\-([0-9]+)\-([0-9]+)', line)
|
||||||
if sd_re:
|
if sd_re:
|
||||||
sched_date = datetime.date(int(sd_re.group(1)),
|
sched_date = datetime.date(int(sd_re.group(1)),
|
||||||
int(sd_re.group(2)),
|
int(sd_re.group(2)),
|
||||||
@@ -129,6 +137,10 @@ def makelist(filename):
|
|||||||
int(dd_re.group(2)),
|
int(dd_re.group(2)),
|
||||||
int(dd_re.group(3)) )
|
int(dd_re.group(3)) )
|
||||||
|
|
||||||
|
# Ignore property drawer, scheduled, closed, deadline and # lines from body
|
||||||
|
if not in_properties_drawer and not cd_re and not sd_re and not dd_re and line[:1] != '#':
|
||||||
|
bodytext = bodytext + line
|
||||||
|
|
||||||
# write out last node
|
# write out last node
|
||||||
thisNode = Orgnode(level, heading, bodytext, tag1, alltags)
|
thisNode = Orgnode(level, heading, bodytext, tag1, alltags)
|
||||||
thisNode.setProperties(propdict)
|
thisNode.setProperties(propdict)
|
||||||
|
|||||||
Reference in New Issue
Block a user