Integrate second pass ex 21 model improvements and move to notebook based training #95

Open — wants to merge 140 commits into base: main (showing changes from 132 commits)

Commits (140)
5767035
Initial dagster integration
zschira Aug 13, 2024
9d9fbfd
Update validate integration test to dagster infra
zschira Aug 14, 2024
3da9659
Merge branch 'main' into dagster_integration
zschira Aug 20, 2024
ee77e7a
Generalize mltools
zschira Aug 27, 2024
53d3354
Reorg repo to move towards generalized modelling repo
zschira Aug 28, 2024
014bcb1
Change library module structure
zschira Aug 28, 2024
5404148
Turn experiment_tracking into sub-package
zschira Aug 28, 2024
886614f
Remove unused function
zschira Aug 28, 2024
dec80b8
Gracefully handle mlflow run on failure
zschira Aug 28, 2024
e725f3d
Fix variable name
zschira Aug 28, 2024
df44ed5
Change experiment tracker resource names
zschira Aug 28, 2024
93da052
Add mlflow artifact io-manager
zschira Aug 28, 2024
07713e9
Simplify pudl_models decorator
zschira Aug 29, 2024
5d89ec6
Split extraction logging into two funcs
zschira Aug 29, 2024
c57818a
Add mlflow metrics io-manager
zschira Aug 29, 2024
625783b
Change pudl_model to pudl_pipeline
zschira Aug 29, 2024
4f50a7b
Add validation pipeline
zschira Aug 30, 2024
f6ab22c
Streamline construction of dagster jobs for running/testing pudl models
zschira Sep 2, 2024
f20fb7d
Remove old comment
zschira Sep 2, 2024
92e2e00
Add ex21 to dagster jobs
zschira Sep 3, 2024
520e6d1
Prep for multiple code locations
zschira Sep 3, 2024
e99ee1a
Add top-level workspace file
zschira Sep 3, 2024
559c0e6
Restructure docs
zschira Sep 3, 2024
93d02f3
Add train model job
zschira Sep 3, 2024
5190bf9
Log mlflow artifacts as parquet until csv is fixed
zschira Sep 3, 2024
ca9599e
Fix ex21 extraction
zschira Sep 4, 2024
7e7a503
Add development section to docs
zschira Sep 4, 2024
61f48c3
Fix integration tests
zschira Sep 4, 2024
0fd8ffc
Don't run ruff on notebooks
zschira Sep 4, 2024
97d5587
xfail ex21 integration test
zschira Sep 4, 2024
ace268b
Add parquet upath io-manager
zschira Sep 5, 2024
fb1feeb
Remove nb-output clear
zschira Sep 5, 2024
294ec72
Test docker deployment
zschira Sep 5, 2024
4de51b3
Chunk ex 21 extraction
zschira Sep 5, 2024
214e28f
Fix assign copy
zschira Sep 6, 2024
c5736e0
Add job for testing ex21 resource usage
zschira Sep 6, 2024
4a81e88
Merge branch 'test_parquet_logging' into dagster_integration
zschira Sep 6, 2024
ec39633
Remove test docker files
zschira Sep 6, 2024
101ccf1
Remove complex asset factory
zschira Sep 6, 2024
7e0c5a5
Parallelize ex21 extraction
zschira Sep 6, 2024
080d790
Don't chunk in inference module
zschira Sep 6, 2024
44dfc52
Handle failures in converting to pdf
zschira Sep 6, 2024
6e24157
Delete cached pdfs early
zschira Sep 6, 2024
cd06d07
Add metadata to chunk_filings
zschira Sep 9, 2024
e3e8c45
Catch oom errors while extracting ex21
zschira Sep 9, 2024
350defb
Fix ex21 gcs io-manager
zschira Sep 9, 2024
3c80b72
Fix partitions for basic 10k extraction.
zschira Sep 9, 2024
31971b7
Cache layoutlm locally
zschira Sep 9, 2024
634a050
Fix caching model
zschira Sep 9, 2024
69ee4c0
Remove bad call
zschira Sep 9, 2024
63d6600
Test own_per conversion
zschira Sep 10, 2024
c8490d4
Add pandera types for output tables
zschira Sep 10, 2024
fa4f57d
Add missing entities module
zschira Sep 10, 2024
35e917d
Don't cache model, load with io manager
zschira Sep 10, 2024
a7b1c7f
Remove float conversion
zschira Sep 10, 2024
f019117
Add hypothesis to deps
zschira Sep 10, 2024
d7d13d8
Make own_per str
zschira Sep 10, 2024
70f5293
Remove astype
zschira Sep 10, 2024
e406092
Validate ex21 return types
zschira Sep 10, 2024
f3835d9
Clean model download temp dir
zschira Sep 11, 2024
3c995cd
Fix model return type
zschira Sep 11, 2024
ef55e4b
Catch errors in creating ex 21 dataset
zschira Sep 11, 2024
b37450a
Fix column name
zschira Sep 11, 2024
06b18ed
Try to catch empty pdf errors
zschira Sep 12, 2024
abfc006
Print traceback in caught exception
zschira Sep 12, 2024
ff92a55
Fix empty pdf check
zschira Sep 12, 2024
8aa8c95
Actually fix empty pdf check?
zschira Sep 12, 2024
43600bc
Use UPath in GCSArchive
zschira Sep 18, 2024
05ad82c
Make _configure_mlflow a standalone function
zschira Sep 18, 2024
fddc3b2
Merge branch 'main' into error_handling_improvements
zschira Sep 18, 2024
99fc7ed
Try to skip notebooks in ruff check
zschira Sep 18, 2024
b135500
Pull integration test fixes from main
zschira Sep 19, 2024
6e868f2
Fix typos in README.rst
zschira Sep 19, 2024
df4fd09
Cache downloaded layoutlm in dagster home
zschira Sep 19, 2024
74d237d
Merge branch 'error_handling_improvements' of github.com:catalyst-coo…
zschira Sep 19, 2024
3642765
Fix broken test
zschira Sep 19, 2024
830bd74
fix rename filings
katie-lamb Sep 20, 2024
2cd1fe6
fix paths to cache training data
katie-lamb Sep 20, 2024
64dc8c5
update root dir path
katie-lamb Sep 20, 2024
226d91c
Fix UPath initialization
zschira Sep 20, 2024
3c17d33
Fix path in test
zschira Sep 20, 2024
df69f42
Create huggingface dataset outside model execution
zschira Sep 20, 2024
2d3345c
small fixes to path handling
katie-lamb Sep 20, 2024
46e7b40
Merge branch 'error_handling_improvements' into second-pass-ex21-impr…
katie-lamb Sep 20, 2024
6f9d34a
Minor fixes
zschira Sep 23, 2024
07d500a
Start migrating model training to notebook
zschira Sep 23, 2024
81813a7
Create dataset as dataframe for logging
zschira Sep 24, 2024
5174ed7
Modify dataset return type
zschira Sep 24, 2024
7a572c0
Fix dataset types for model signature
zschira Sep 24, 2024
5728026
Migrate ex 21 model training to a notebook
zschira Sep 25, 2024
5fbbfff
Merge initial notebook migration (broken)
zschira Oct 3, 2024
37edd50
Split dataset loading into separate assets
zschira Oct 4, 2024
d6889e3
Minor notebook fixes
zschira Oct 4, 2024
d5e013a
Fix import in notebook
zschira Oct 4, 2024
f9810db
add device to pipeline
zschira Oct 4, 2024
2760881
Fix signature inference
zschira Oct 4, 2024
1dcacfa
Fix notebook dagster config
zschira Oct 4, 2024
39bb45b
Fix config param name
zschira Oct 4, 2024
cb83862
Partition training data
zschira Oct 5, 2024
c71593c
Add partitions to notebook asset
zschira Oct 5, 2024
4efa515
Update ex21 labels
zschira Oct 6, 2024
581b2e3
Use run name for specifying training runs
zschira Oct 6, 2024
c67a1be
Rework how notebook is configured
zschira Oct 6, 2024
b8a5b24
Finetune configuration
zschira Oct 6, 2024
45d5cf8
separate inference dataset creation from model prediction
zschira Oct 7, 2024
3e15b1f
Remove deprecated inference module
zschira Oct 7, 2024
60a1260
Add notebook for training ex21 classifier
zschira Oct 8, 2024
4105110
Pull in model updates
zschira Oct 8, 2024
4d29037
Update classifier model
zschira Oct 8, 2024
85c44ff
Fix set on copy pandas issue
zschira Oct 9, 2024
52e3580
Fix model uri's
zschira Oct 9, 2024
b709053
Fix indices in extraction model
zschira Oct 9, 2024
b8dad3c
Fix typo
zschira Oct 9, 2024
e6b29ff
Add asset factory for loading models
zschira Oct 10, 2024
3d11777
Catch layout classification NaN exception
zschira Oct 10, 2024
df5fe0d
Use GCS pickle io-manager
zschira Oct 10, 2024
d6c41a2
Switch gcs pickle io manager to upath based
zschira Oct 11, 2024
ddd2263
Remove duplicate logger
zschira Oct 11, 2024
93bffcb
Fix config warnings
zschira Oct 11, 2024
d717caa
Test pin sphinx
zschira Oct 11, 2024
15be127
Catch errors while normalizing bounding boxes
zschira Oct 14, 2024
4117d0a
Fix call to pandera example
zschira Oct 14, 2024
8c8dd60
Fix handle failures in converting to pdf
zschira Oct 14, 2024
ff821b5
Actually fix handle failures in converting to pdf
zschira Oct 14, 2024
a8eb359
Add model documentation to sec10k readme
zschira Oct 16, 2024
dc160ac
Fix ex 21 validation integration test
zschira Oct 16, 2024
10b24a9
Improve classifier error handling
zschira Oct 16, 2024
ad54979
Fully broaden classifier errors
zschira Oct 16, 2024
672e123
add more docs on running the notebooks
zschira Oct 16, 2024
2dbdcaa
clean up feature creation in paragraph classifier
katie-lamb Oct 22, 2024
cda3225
fix feature creation function
katie-lamb Oct 22, 2024
509b7a0
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 22, 2024
8855e5e
small fixes to read in comments in tracking dataframe
katie-lamb Oct 22, 2024
f806ad8
Merge branch 'prep_paragraph_classifier' of https://github.com/cataly…
katie-lamb Oct 22, 2024
590ba60
updates to model pipeline
katie-lamb Oct 23, 2024
3db47d4
take out logging messages
katie-lamb Oct 23, 2024
5f23e1c
update to exclude paragraph layout docs in labeled data tracking
katie-lamb Oct 23, 2024
e166c3d
Remove paragraph filenames from validation data
katie-lamb Oct 23, 2024
f529ca9
Split layoutlm training from inference model/validation
zschira Oct 25, 2024
7dcf5ac
Merge branch 'prep_paragraph_classifier' of github.com:catalyst-coope…
zschira Oct 25, 2024
20 changes: 10 additions & 10 deletions pyproject.toml
@@ -92,7 +92,7 @@ dev = [
docs = [
"doc8>=1,<2", # Ensures clean documentation formatting
"furo>=2022.4.7",
"sphinx>=6,<9", # The default Python documentation engine
"sphinx>=6,<8.1", # The default Python documentation engine
"sphinx-autoapi>=2,<4", # Generates documentation from docstrings
"sphinx-issues>=1.2,<5", # Allows references to GitHub issues

@@ -157,7 +157,7 @@ doctest_optionflags = [

[tool.ruff]
exclude = ["notebooks/*"]
select = [
lint.select = [
"A", # flake8-builtins
# "ARG", # unused arguments
# "B", # flake8-bugbear
@@ -185,7 +185,7 @@ select = [
"UP", # pyupgrade (use modern python syntax)
"W", # pycodestyle warnings
]
ignore = [
lint.ignore = [
"D401", # Require imperative mood in docstrings.
"D417",
"E501", # Overlong lines.
@@ -205,26 +205,26 @@ target-version = "py311"
line-length = 88

# Don't automatically concatenate strings -- sometimes we forget a comma!
unfixable = ["ISC"]
lint.unfixable = ["ISC"]

[tool.ruff.per-file-ignores]
[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["F401"] # Ignore unused imports
"tests/*" = ["D"]

[tool.ruff.pep8-naming]
[tool.ruff.lint.pep8-naming]
# Allow Pydantic's `@validator` decorator to trigger class method treatment.
classmethod-decorators = ["pydantic.validator", "pydantic.root_validator"]

[tool.ruff.isort]
[tool.ruff.lint.isort]
known-first-party = ["pudl"]

[tool.ruff.pydocstyle]
[tool.ruff.lint.pydocstyle]
convention = "google"

[tool.ruff.mccabe]
[tool.ruff.lint.mccabe]
max-complexity = 10

[tool.ruff.flake8-quotes]
[tool.ruff.lint.flake8-quotes]
docstring-quotes = "double"
inline-quotes = "double"
multiline-quotes = "double"
18 changes: 18 additions & 0 deletions src/mozilla_sec_eia/library/generic_io_managers.py
@@ -1,5 +1,7 @@
"""Implement useful generic io-managers."""

import pickle

import pandas as pd
from dagster import InputContext, OutputContext, UPathIOManager
from upath import UPath
@@ -19,3 +21,19 @@ def load_from_path(self, context: InputContext, path: UPath) -> pd.DataFrame:
"""Read parquet."""
with path.open("rb") as file:
return pd.read_parquet(file)


class PickleUPathIOManager(UPathIOManager):
Comment from the PR author:
I added this to save pickled asset outputs to GCS. This is needed because I separated the ex21 inference dataset creation from actually running the model, but the datasets take up too much space if they're saved locally.

"""Read and write arbitrary Python objects as pickle files on a local or remote filesystem."""

extension: str = ".pickle"

def dump_to_path(self, context: OutputContext, obj, path: UPath):
"""Write pickle."""
with path.open("wb") as file:
pickle.dump(obj, file)

def load_from_path(self, context: InputContext, path: UPath):
"""Read pickle."""
with path.open("rb") as file:
return pickle.load(file) # noqa: S301
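
The IO manager above boils down to a pickle round trip over a `UPath`. A minimal sketch of that behavior outside Dagster — `pathlib.Path` stands in for `UPath`, and the standalone function names are illustrative:

```python
import pickle
import tempfile
from pathlib import Path


def dump_to_path(obj, path: Path) -> None:
    """Serialize any picklable object to the given path."""
    with path.open("wb") as file:
        pickle.dump(obj, file)


def load_from_path(path: Path):
    """Deserialize a pickled object from the given path."""
    with path.open("rb") as file:
        return pickle.load(file)  # noqa: S301


with tempfile.TemporaryDirectory() as tmp:
    path = Path(tmp) / "asset.pickle"
    dump_to_path({"subsidiary": "Acme Corp", "own_per": "100"}, path)
    assert load_from_path(path) == {"subsidiary": "Acme Corp", "own_per": "100"}
```

Because `UPath.open` works the same way against GCS, the real IO manager gets remote storage for free from the `UPathIOManager` base class.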
20 changes: 20 additions & 0 deletions src/mozilla_sec_eia/library/mlflow/__init__.py
@@ -1,9 +1,13 @@
"""Implement tooling to interface with mlflow experiment tracking."""

from dagster import Config, asset
from pydantic import create_model

from .mlflow_io_managers import (
MlflowBaseIOManager,
MlflowMetricsIOManager,
MlflowPandasArtifactIOManager,
MlflowPyfuncModelIOManager,
)
from .mlflow_resource import (
MlflowInterface,
@@ -12,6 +16,22 @@
)


def pyfunc_model_asset_factory(name: str, mlflow_run_uri: str):
Comment from the PR author:
This function will create an asset to load a model from mlflow. Using create_model is a little bit of a weird way to provide configuration to the asset, but this ensures that the default value for mlflow_run_uri will show up in the dagster UI.

"""Create asset for loading a model logged to mlflow."""
PyfuncConfig = create_model( # NOQA: N806
f"PyfuncConfig{name}", mlflow_run_uri=(str, mlflow_run_uri), __base__=Config
)

@asset(
name=name,
io_manager_key="pyfunc_model_io_manager",
)
def _model_asset(config: PyfuncConfig):
return config.mlflow_run_uri

return _model_asset


def get_mlflow_io_manager(
key: str,
mlflow_interface: MlflowInterface | None = None,
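
The `create_model` trick the comment describes can be demonstrated with plain pydantic — here `BaseModel` stands in for dagster's `Config`, and the run URI is a placeholder. The dynamically built class carries a default for `mlflow_run_uri`, which is what lets the default surface in the Dagster UI:

```python
from pydantic import BaseModel, create_model


def make_config(name: str, mlflow_run_uri: str):
    """Dynamically build a config class with a defaulted mlflow_run_uri field."""
    return create_model(
        f"PyfuncConfig{name}",
        mlflow_run_uri=(str, mlflow_run_uri),
        __base__=BaseModel,
    )


LayoutlmConfig = make_config("Layoutlm", "runs:/example-run-id/model")
# The default is baked into the generated class...
assert LayoutlmConfig().mlflow_run_uri == "runs:/example-run-id/model"
# ...but can still be overridden per-instance.
assert LayoutlmConfig(mlflow_run_uri="runs:/other/model").mlflow_run_uri == "runs:/other/model"
```

A hard-coded `Config` subclass would work too, but then every model asset made by the factory would share one class and one default rather than getting its own.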
26 changes: 26 additions & 0 deletions src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py
@@ -26,6 +26,32 @@ def _get_run_info(self) -> Run:
return mlflow.get_run(self.mlflow_interface.mlflow_run_id)


class MlflowPyfuncModelIOManager(MlflowBaseIOManager):
"""IO Manager to load pyfunc models from tracking server."""

uri: str | None = None

def handle_output(self, context: OutputContext, model_uri: str):
"""Takes model uri as a string and caches the model locally for future use."""
cache_path = self.mlflow_interface.dagster_home_path / "model_cache"
cache_path.mkdir(exist_ok=True, parents=True)

logger.info(f"Caching {context.name} model at {cache_path}")
mlflow.pyfunc.load_model(
model_uri,
dst_path=cache_path,
)

def load_input(self, context: InputContext):
"""Load pyfunc model with mlflow server."""
cache_path = (
self.mlflow_interface.dagster_home_path / "model_cache" / context.name
)
logger.info(f"Loading {context.name} model from {cache_path}")

return mlflow.pyfunc.load_model(cache_path)


class MlflowPandasArtifactIOManager(MlflowBaseIOManager):
"""Implement IO manager for logging/loading dataframes as mlflow artifacts."""

15 changes: 11 additions & 4 deletions src/mozilla_sec_eia/library/model_jobs.py
@@ -25,6 +25,7 @@ def create_production_model_job(
job_name: str,
assets: list[AssetsDefinition],
concurrency_limit: int | None = None,
tag_concurrency_limits: list[dict] | None = None,
**kwargs,
) -> JobDefinition:
"""Construct a dagster job and supply Definitions with assets and resources."""
@@ -39,10 +40,16 @@
}
},
}
if concurrency_limit is not None:
config["execution"] = {
"config": {"multiprocess": {"max_concurrent": concurrency_limit}}
}
if (concurrency_limit is not None) or (tag_concurrency_limits is not None):
config["execution"] = {"config": {"multiprocess": {}}}
if concurrency_limit is not None:
config["execution"]["config"]["multiprocess"][
"max_concurrent"
] = concurrency_limit
else:
config["execution"]["config"]["multiprocess"][
"tag_concurrency_limits"
] = tag_concurrency_limits

return define_asset_job(
job_name,
80 changes: 80 additions & 0 deletions src/mozilla_sec_eia/models/sec10k/README.rst
@@ -3,6 +3,86 @@ sec10k: Extracting company ownership data from sec10k documents

This repo contains exploratory development for an SEC-EIA linkage.

Models
------
Basic 10k
^^^^^^^^^
The extraction model for basic 10k company information is simple and requires no
training. It is a rules-based parser that finds key-value pairs of company
information embedded in the header of every 10k filing.
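
A minimal sketch of that style of rules-based key-value parsing — the header format shown here is illustrative, not the exact SEC layout:

```python
import re


def parse_header(header: str) -> dict[str, str]:
    """Extract 'KEY: value' pairs from an uppercase-keyed filing header."""
    pairs = {}
    for line in header.splitlines():
        match = re.match(r"\s*([A-Z][A-Z .]+?):\s+(.+?)\s*$", line)
        if match:
            pairs[match.group(1).strip()] = match.group(2)
    return pairs


header = """
COMPANY CONFORMED NAME: ACME POWER CORP
CENTRAL INDEX KEY: 0000123456
STATE OF INCORPORATION: DE
"""
info = parse_header(header)
assert info["COMPANY CONFORMED NAME"] == "ACME POWER CORP"
assert info["CENTRAL INDEX KEY"] == "0000123456"
```

Because the header format is standardized across filings, this kind of parser needs no training data, which is why the basic 10k model sits apart from the learned exhibit 21 models below.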

Exhibit 21
^^^^^^^^^^
Exhibit 21 extraction is much more complicated and requires pretrained models that are
cached with our mlflow tracking server. Currently, there are two models, implemented
in the ``notebooks/`` directory. These notebooks use
`Dagstermill <https://docs.dagster.io/integrations/dagstermill/using-notebooks-with-dagster>`_,
so they can be run interactively like any normal Jupyter notebook, or run in a Dagster
job.

Extraction
""""""""""
The primary extraction model is implemented in ``notebooks/exhibit21_extractor.ipynb``.
This model is based on
`layoutlm <https://huggingface.co/microsoft/layoutlmv3-base>`_ with custom inference logic
to construct a table of ownership information from an exhibit 21 document. The
layoutlm model and the inference model are logged separately with mlflow. This
separation allows testing minor modifications to the inference portion against the
same pretrained layoutlm model.

There are currently two configuration parameters used by the extraction model
notebook:

* ``layoutlm_training_run``: The name of an existing mlflow run that was used to
  train layoutlm and has a logged model associated with it. If ``None``, layoutlm
  will be trained when the notebook is run, and the new training run will be used
  for inference and validation.
* ``training_data_version``: A GCS folder containing training data to use with
  layoutlm. If ``layoutlm_training_run`` is set, this parameter is ignored, as
  layoutlm will not be re-trained when the notebook is executed.
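
In the Dagster launchpad, these parameters end up in run config along these lines. This is a hedged sketch: the asset name and exact config nesting are assumptions, and the run name and data version are placeholders.

```python
# Illustrative run config for the extraction notebook asset.
run_config = {
    "ops": {
        "exhibit21_extractor": {
            "config": {
                # Reuse an existing layoutlm training run by name...
                "layoutlm_training_run": "existing-run-name",
                # ...in which case this is ignored; set the run to None to
                # retrain layoutlm from this training data instead.
                "training_data_version": "v1",
            }
        }
    }
}
```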

The notebook also depends on several upstream dagster assets, which produce training and
validation datasets. Using upstream assets allows these datasets, which are relatively
expensive to produce, to be easily cached and reused while iterating on the model.
These upstream assets must be materialized before the notebook can be run. Re-materialize
them if you want to modify the training or validation data; otherwise, the notebook can
be re-run as many times as desired with the existing data.

Layout Classification
"""""""""""""""""""""
The second model is a classifier that labels filings as either having a 'paragraph'
layout or not. This is needed because the extraction model performs poorly on documents
formatted as paragraphs rather than tables. For now we will likely just filter out these
results, but we could also develop a separate extraction model that handles such
documents better.

This model is located in ``notebooks/exhibit21_layout_classifier.ipynb``, and it also
depends on upstream assets that produce training data, which need to be materialized
before running the notebook.

Training the Models
"""""""""""""""""""
The models are trained by running the notebooks, either interactively like a normal
notebook or through dagster directly.

Either way, you will first need to produce the upstream data assets:

1. Launch dagster from the repo root with the ``dagster dev`` command
2. Locate the training job in question using the web UI
3. Select the upstream assets by holding shift and clicking each asset, excluding
   the notebook asset
4. Click ``Materialize all`` in the UI

Once this is complete, you can launch Jupyter and run the notebooks interactively
as you would any other notebook. The first cell loads the upstream assets and sets
configuration, which you can modify directly in the notebook as normal.

To run a notebook in dagster, execute it like any other asset. You can first set
configuration in the dagster launchpad if desired; when execution completes, click
on the asset to view the fully rendered notebook.

Usage
-----
