washingtonpost · NickCrews · Apr 6, 2023 · Apr 6, 2023 · Apr 6, 2023 · Apr 6, 2023
@@ -5,7 +5,7 @@
 bin
 *.test
 *.fec
-!python/tests/fixtures/*.fec
+!python/tests/cases/**/*.fec
 output/
 env/
 zig-cache
@@ -38,4 +38,4 @@ share/python-wheels/
 .installed.cfg
 *.egg
 *.whl
-MANIFEST
+MANIFEST
@@ -1 +1 @@
-0.1.9
+0.1.10
@@ -85,11 +85,15 @@ pub fn build(b: *std.build.Builder) !void {
 
 const libSources = [_][]const u8{
     "src/buffer.c",
+    "src/mappings.c",
     "src/memory.c",
+    "src/path.c",
     "src/encoding.c",
     "src/csv.c",
     "src/writer.c",
     "src/fec.c",
+    "src/regex.c",
+    "src/string_utils.c",
 };
 const pcreSources = [_][]const u8{
     "src/pcre/pcre_chartables.c",
@@ -114,12 +118,23 @@ const pcreSources = [_][]const u8{
     "src/pcre/pcre_version.c",
     "src/pcre/pcre_xclass.c",
 };
-const tests = [_][]const u8{ "src/buffer_test.c", "src/csv_test.c", "src/writer_test.c", "src/cli_test.c" };
-const testIncludes = [_][]const u8{ "src/buffer.c", "src/memory.c", "src/encoding.c", "src/csv.c", "src/writer.c", "src/cli.c" };
+const tests = [_][]const u8{
+    "src/buffer_test.c",
+    "src/csv_test.c",
+    "src/writer_test.c",
+    "src/cli_test.c",
+    "src/fec_test.c",
+};
+const testIncludes = libSources ++ [_][]const u8{
+    "src/cli.c",
+};
 const buildOptions = [_][]const u8{
     "-std=c11",
     "-pedantic",
     "-Wall",
     "-W",
     "-Wno-missing-field-initializers",
+    // The string literals in mappings_generated.h are super long, which gives us
+    // warnings, but this isn't actually a problem unless we used som ancient compiler.
+    "-Wno-overlength-strings",
 };
@@ -1,17 +1,18 @@
-"""
-A Python library to interface with FastFEC.
+"""A Python library to interface with FastFEC.
 
 This library provides methods to
   * parse a .fec file line by line, yieling a parsed result
   * parse a .fec file into parsed output .csv files
 """
+from __future__ import annotations
 
 import contextlib
-import os
+import io
 import pathlib
 from ctypes import CDLL, c_char_p, c_int, c_void_p
 from queue import Queue
 from threading import Thread
+from typing import Any, Generator
 
 from .utils import (
     BUFFER_READ,
@@ -27,29 +28,39 @@
 
 
 class LibFastFEC:
-    """
-    Python wrapper for the fastfec library
-    """
+    """Python wrapper for the fastfec library."""
 
-    def __init__(self):
+    def __init__(self) -> None:
         self.__init_lib()
 
         # Initialize
         self.persistent_memory_context = self.libfastfec.newPersistentMemoryContext()
 
-    def parse(self, file_handle, include_filing_id=None, should_parse_date=True):
-        """
-        Parses the input file line-by-line
+    def parse(
+        self,
+        file_handle: io.BinaryIO,
+        include_filing_id: str | None = None,
+        should_parse_date: bool = True,
+        raw: bool = False,
+    ) -> Generator[tuple[str, dict[str, Any]], None, None]:
+        """Parses the input file line-by-line.
 
         Arguments:
+        ---------
             file_handle -- An input stream for reading a .fec file
-            include_filing_id -- If set, prepend a column into each outputted csv for filing_id
-                                 with the specified filing id (defaults to None)
-            should_parse_date -- If true, yields parsed datetime.date objects for date fields; if
-                                 false, yields strings for date fields. This would mainly be set to
-                                 false for performance reasons (defaults to true)
+            include_filing_id -- If set, prepend a column into each outputted csv
+                                 for filing_id with the specified filing id.
+            should_parse_date -- If True, date fields are parsed to datetime.date.
+                                 If False, date fields are returned as raw YYYY-MM-DD
+                                 strings. This would mainly be set to false for
+                                 performance reasons.
+            raw -- If True, if there are fewer or more fields in a row than we
+                   expect, the row will be written to the output file as-is.
+                   If False, we will add empty fields, or skip extra fields,
+                   to the row to make it the correct length.
 
         Returns:
+        -------
             A generator that receives the form name and a dictionary
             object describing each line in the file
         """
@@ -58,26 +69,28 @@ def parse(self, file_handle, include_filing_id=None, should_parse_date=True):
         done_processing = object()  # A custom object to signal the end of processing
 
         # Prepare the filing id to include, if specified
-        include_filing_id = as_bytes(include_filing_id)
-        filing_id_included = include_filing_id is not None
+        filing_id = as_bytes(include_filing_id)
+        filing_id_included = filing_id is not None
 
         # Provide a custom line callback
         buffer_read_fn = provide_read_callback(file_handle)
-        line_callback_fn = CUSTOM_LINE(provide_line_callback(queue, filing_id_included, should_parse_date))
+        line_callback_fn = CUSTOM_LINE(
+            provide_line_callback(queue, filing_id_included, should_parse_date),
+        )
         fec_context = self.libfastfec.newFecContext(
-            self.persistent_memory_context,
-            buffer_read_fn,
-            BUFFER_SIZE,
-            CUSTOM_WRITE(0),
-            BUFFER_SIZE,
-            line_callback_fn,
-            0,
-            None,
-            include_filing_id,
-            None,
-            filing_id_included,
-            1,
-            0,
+            self.persistent_memory_context,  # persistentMemory
+            buffer_read_fn,  # bufferRead
+            BUFFER_SIZE,  # inputBufferSize
+            CUSTOM_WRITE(0),  # customWriteFunction
+            BUFFER_SIZE,  # outputBufferSize
+            line_callback_fn,  # customLineFunction
+            0,  # writeToFile
+            None,  # file
+            None,  # outputDirectory
+            filing_id,  # filingId
+            1,  # silent
+            0,  # warn
+            raw,  # raw
         )
 
         # Run the parsing in a separate thread. It's essentially still single-threaded
@@ -87,7 +100,7 @@ def task():
             self.libfastfec.parseFec(fec_context)
             queue.put(done_processing)  # Signal processing is over
 
-        Thread(target=task, args=()).start()
+        Thread(target=task, args=(), daemon=True).start()
 
         # Yield processed lines
         while True:
@@ -101,69 +114,97 @@ def task():
         # Free FEC context
         self.libfastfec.freeFecContext(fec_context)
 
-    def parse_as_files(self, file_handle, output_directory, include_filing_id=None):
-        """
-        Parses the input file into output files in the output directory
+    def parse_as_files(
+        self,
+        file_handle: io.BinaryIO,
+        output_directory: str | pathlib.Path,
+        include_filing_id: str | None = None,
+        raw: bool = False,
+    ) -> int:
+        """Parses the input file into output files in the output directory.
 
         Parent directories will be automatically created as needed.
 
         Arguments:
+        ---------
             file_handle -- An input stream for reading a .fec file
             output_directory -- A directory in which to place output parsed .csv files
-            include_filing_id -- If set, prepend a column into each outputted csv for filing_id
-                                 with the specified filing id (defaults to None)
+            include_filing_id -- If set, prepend a column `filing_id` into each
+                                 outputted csv filled with the specified value.
+            raw -- If True, if there are fewer or more fields in a row than we
+                   expect, the row will be written to the output file as-is.
+                   If False, we will add empty fields, or skip extra fields,
+                   to the row to make it the correct length.
 
         Returns:
+        -------
             A status code. 1 indicates a successful parse, 0 an unsuccessful one.
         """
+        out_path = pathlib.Path(output_directory)
 
         # Custom open method
-        def open_output_file(filename, *args, **kwargs):
-            filename = os.path.join(output_directory, filename)
-            output_file = pathlib.Path(filename)
-            output_file.parent.mkdir(exist_ok=True, parents=True)
+        def open_output_file(form_type: str, *args, **kwargs):
+            form_type = form_type.replace("/", "-")
+            path = out_path / form_type
+            path.parent.mkdir(exist_ok=True, parents=True)
             # pylint: disable=consider-using-with,unspecified-encoding,bad-option-value
-            return open(filename, *args, **kwargs)
+            return open(path, *args, **kwargs)
 
-        return self.parse_as_files_custom(file_handle, open_output_file, include_filing_id=include_filing_id)
+        return self.parse_as_files_custom(
+            file_handle,
+            open_output_file,
+            include_filing_id=include_filing_id,
+            raw=raw,
+        )
 
-    def parse_as_files_custom(self, file_handle, open_function, include_filing_id=None):
-        """
-        Parses the input file into output files
+    def parse_as_files_custom(
+        self,
+        file_handle: io.BinaryIO,
+        open_function,
+        include_filing_id: str | None = None,
+        raw: bool = False,
+    ) -> int:
+        """Parses the input file into output files.
 
         Arguments:
+        ---------
             file_handle -- An input stream for reading a .fec file
-            open_function -- A function to open an output file for writing. This can be set to
-                             customize the output stream for each parsed .csv file
-            include_filing_id -- If set, prepend a column into each outputted csv for filing_id
-                                 with the specified filing id (defaults to None)
+            open_function -- A function to open an output file for writing, given
+                             a form type. This can be set to customize the output
+                             stream for each parsed .csv file
+            include_filing_id -- If set, prepend a column `filing_id` into each
+                                 outputted csv filled with the specified value.
+            raw -- If True, if there are fewer or more fields in a row than we
+                   expect, the row will be written to the output file as-is.
+                   If False, we will add empty fields, or skip extra fields,
+                   to the row to make it the correct length.
 
         Returns:
+        -------
             A status code. 1 indicates a successful parse, 0 an unsuccessful one.
         """
         # Set callbacks
         buffer_read_fn = provide_read_callback(file_handle)
         write_callback_fn, free_file_descriptors = provide_write_callback(open_function)
 
         # Prepare the filing id to include, if specified
-        include_filing_id = as_bytes(include_filing_id)
-        filing_id_included = include_filing_id is not None
+        filing_id = as_bytes(include_filing_id)
 
         # Initialize fastfec context
         fec_context = self.libfastfec.newFecContext(
-            self.persistent_memory_context,
-            buffer_read_fn,
-            BUFFER_SIZE,
-            write_callback_fn,
-            BUFFER_SIZE,
-            CUSTOM_LINE(0),
-            0,
-            None,
-            include_filing_id,
-            None,
-            filing_id_included,
-            1,
-            0,
+            self.persistent_memory_context,  # persistentMemory
+            buffer_read_fn,  # bufferRead
+            BUFFER_SIZE,  # inputBufferSize
+            write_callback_fn,  # customWriteFunction
+            BUFFER_SIZE,  # outputBufferSize
+            CUSTOM_LINE(0),  # customLineFunction
+            0,  # writeToFile
+            None,  # file
+            None,  # outputDirectory
+            filing_id,  # filingId
+            1,  # silent
+            0,  # warn
+            raw,  # raw
         )
 
         # Parse
@@ -175,13 +216,11 @@ def parse_as_files_custom(self, file_handle, open_function, include_filing_id=No
 
         return result
 
-    def free(self):
-        """
-        Frees all the allocated memory from the fastfec library
-        """
+    def free(self) -> None:
+        """Frees all the allocated memory from the fastfec library."""
         self.libfastfec.freePersistentMemoryContext(self.persistent_memory_context)
 
-    def __init_lib(self):
+    def __init_lib(self) -> None:
         # Find the fastfec library
         self.libfastfec = CDLL(find_fastfec_lib())
 
@@ -190,19 +229,19 @@ def __init_lib(self):
         self.libfastfec.newPersistentMemoryContext.restype = c_void_p
 
         self.libfastfec.newFecContext.argtypes = [
-            c_void_p,
-            BUFFER_READ,
-            c_int,
-            CUSTOM_WRITE,
-            c_int,
-            CUSTOM_LINE,
-            c_int,
-            c_void_p,
-            c_char_p,
-            c_char_p,
-            c_int,
-            c_int,
-            c_int,
+            c_void_p,  # persistentMemory
+            BUFFER_READ,  # bufferRead
+            c_int,  # inputBufferSize
+            CUSTOM_WRITE,  # customWriteFunction
+            c_int,  # outputBufferSize
+            CUSTOM_LINE,  # customLineFunction
+            c_int,  # writeToFile
+            c_void_p,  # file
+            c_char_p,  # outputDirectory
+            c_char_p,  # filingId
+            c_int,  # silent
+            c_int,  # warn
+            c_int,  # raw
         ]
         self.libfastfec.newFecContext.restype = c_void_p
         self.libfastfec.parseFec.argtypes = [c_void_p]
@@ -212,9 +251,8 @@ def __init_lib(self):
 
 
 @contextlib.contextmanager
-def FastFEC():  # pylint: disable=invalid-name
-    """
-    A convenience method to run fastfec and free memory afterwards
+def FastFEC() -> Generator[LibFastFEC, None, None]:  # pylint: disable=invalid-name
+    """A convenience method to run fastfec and free memory afterwards.
 
     Usage: