From 11517ba8eb3a968f1100caaedd3043d9310cf230 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 12 Feb 2023 17:33:23 -0600 Subject: [PATCH] Encode jsonl data as utf8 for gzip write for consistent read/write encoding Should help with issue #89 --- src/utils/jsonl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/jsonl.py b/src/utils/jsonl.py index 8a034acd..5874c17c 100644 --- a/src/utils/jsonl.py +++ b/src/utils/jsonl.py @@ -51,7 +51,7 @@ def compress_jsonl_data(jsonl_data, output_path): # Create output directory, if it doesn't exist output_path.parent.mkdir(parents=True, exist_ok=True) - with gzip.open(output_path, 'wt') as gzip_file: + with gzip.open(output_path, 'wt', encoding='utf-8') as gzip_file: gzip_file.write(jsonl_data) logger.info(f'Wrote jsonl data to gzip compressed jsonl at {output_path}') \ No newline at end of file