# Provenance: reconstructed from the patch "Improve Extraction of Beancount
# Entries" (Debanjum Singh Solanky, Sat, 26 Feb 2022) against
# src/processor/ledger/beancount_to_jsonl.py. That patch:
#   - only extracts entries starting with YYYY-MM-DD from Beancount
#   - strips trailing escape sequences from entries
import re


def extract_beancount_entries(beancount_files):
    """Extract dated entries from the specified Beancount files.

    Each file is read in full and split into chunks on blank
    (empty or whitespace-only) lines. Only chunks that begin with a
    YYYY-MM-DD date are kept; anything else is dropped.

    Args:
        beancount_files: Iterable of paths to Beancount files to read.

    Returns:
        A list of entry strings, in file order, with surrounding newlines,
        carriage returns, tabs and spaces removed.
    """
    # An entry must start with a YYYY-MM-DD date. The optional leading newline
    # is required because splitting on a blank line leaves the previous line's
    # terminator at the front of the chunk that follows it.
    date_regex = re.compile(r'^\n?\d{4}-\d{2}-\d{2}')
    # Blank or whitespace-only lines act as the separator between entries.
    empty_newline = re.compile(r'^[\n\r\t ]*$', flags=re.MULTILINE)

    entries = []
    for beancount_file in beancount_files:
        with open(beancount_file) as f:
            ledger_content = f.read()

        # str.strip() takes a *set* of characters, not a regex alternation, so
        # the set is spelled without '|' to avoid eating pipes at entry edges.
        entries.extend(
            chunk.strip('\n\r\t ')
            for chunk in empty_newline.split(ledger_content)
            if date_regex.match(chunk)
        )

    return entries