Merge pull request #9 from catalyst-cooperative/dev

Flesh out integration tests.
catalyst-cooperative · May 20, 2022 · 4fb428c · 4fb428c
2 parents 492df1a + f8e2ffe
commit 4fb428c
Show file tree

Hide file tree

Showing 9 changed files with 246 additions and 56 deletions.
diff --git a/.mypy.ini b/.mypy.ini
@@ -12,6 +12,9 @@ ignore_missing_imports = True
 [mypy-fsspec.*]
 ignore_missing_imports = True
 
+[mypy-intake.*]
+ignore_missing_imports = True
+
 [mypy-intake_sql.*]
 ignore_missing_imports = True
 

diff --git a/setup.py b/setup.py
@@ -67,6 +67,7 @@
             "flake8-use-fstring>=1,<2",  # Highlight use of old-style string formatting
             "fsspec[http]",  # Extras required for our specific test cases.
             "mccabe>=0.6,<0.8",  # Checks that code isn't overly complicated
+            "msgpack-numpy>=0.4,<0.5",  # Required to serialize Numpy arrays
             "mypy>=0.942",  # Static type checking
             "pep8-naming>=0.12,<0.13",  # Require PEP8 compliant variable names
             "pre-commit>=2.9,<3",  # Allow us to run pre-commit hooks in testing

diff --git a/src/intake_sqlite/sqlite_src.py b/src/intake_sqlite/sqlite_src.py
@@ -4,6 +4,7 @@
 import logging
 from pathlib import Path
 from typing import Any
+from urllib.parse import urlparse
 
 import fsspec
 from intake_sql import SQLSource, SQLSourceAutoPartition, SQLSourceManualPartition
@@ -144,14 +145,24 @@ def __init__(
 
 def urlpath_to_sqliteurl(urlpath: str, open_kwargs: dict[str, Any] = {}) -> str:
     """Transform a file path or URL into a local SQLite URL."""
-    if Path(urlpath).is_file():
-        p = Path(urlpath)
-        if p.suffix not in SQLITE_SUFFIXES:
-            raise ValueError(
-                f"Expected a SQLite file ending in one of: {SQLITE_SUFFIXES} "
-                f"but got: {p.name}"
-            )
+    parsed = urlparse(urlpath)
+    p = Path(parsed.path)
+    if p.suffix not in SQLITE_SUFFIXES:
+        raise ValueError(
+            f"Expected a SQLite file path ending in one of: {SQLITE_SUFFIXES} "
+            f"but got: {p.name}"
+        )
+    if parsed.scheme != "" and parsed.scheme not in fsspec.available_protocols():
+        raise ValueError(f"URL protocol {parsed.scheme} is not supported by fsspec.")
+    if parsed.scheme == "" and not p.is_file():
+        raise ValueError(f"Local path {p} is not a file!")
+    # At this point we know that EITHER:
+    # * urlpath is a URL supported by fsspec that looks like an SQLite file OR
+    # * p is a local file that looks like an SQLite file
+    if parsed.scheme == "":
+        # Absolute path to the local SQLite DB:
         local_db_path = p.resolve()
     else:
+        # Absolute path to the locally cached SQLite DB:
         local_db_path = fsspec.open_local("simplecache::" + urlpath, **open_kwargs)
-    return "sqlite:///" + str(local_db_path)
+    return f"sqlite:///{local_db_path}"
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,34 +1,71 @@
 """PyTest configuration module. Defines useful fixtures, command line args."""
+from __future__ import annotations
+
 import logging
+import tempfile
+from collections.abc import Generator
 from pathlib import Path
 
+import numpy as np
+import pandas as pd
 import pytest
+import sqlalchemy as sa
 
 logger = logging.getLogger(__name__)
 
 
-def pytest_addoption(parser: pytest.Parser) -> None:
-    """Add package-specific command line options to pytest.
-
-    This is slightly magical -- pytest has a hook that will run this function
-    automatically, adding any options defined here to the internal pytest options that
-    already exist.
-    """
-    parser.addoption(
-        "--sandbox",
-        action="store_true",
-        default=False,
-        help="Flag to indicate that the tests should use a sandbox.",
+@pytest.fixture(scope="session")
+def df1() -> pd.DataFrame:
+    """A dataframe with a named primary key.
+
+    Holds 100 rows of unseeded random data: floats in column ``a``, integers below
+    100 in column ``b``, and one of the letters a-d in column ``c``. The index is
+    named ``pk``.
+    """
+    df = pd.DataFrame(
+        {
+            "a": np.random.rand(100).tolist(),
+            "b": np.random.randint(100, size=100).tolist(),
+            "c": np.random.choice(["a", "b", "c", "d"], size=100).tolist(),
+        }
     )
+    df.index.name = "pk"
+    return df
 
 
 @pytest.fixture(scope="session")
-def test_dir() -> Path:
-    """Return the path to the top-level directory containing the tests.
+def df2() -> pd.DataFrame:
+    """A dataframe with no primary key."""
+    return pd.DataFrame(
+        {
+            "d": np.random.rand(100).tolist(),
+            "e": np.random.randint(100, size=100).tolist(),
+            "f": np.random.choice(["a", "b", "c", "d"], size=100).tolist(),
+        }
+    )
 
-    This might be useful if there's test data stored under the tests directory that
-    you need to be able to access from elsewhere within the tests.
 
-    Mostly this is meant as an example of a fixture.
-    """
-    return Path(__file__).parent
+@pytest.fixture(scope="session")
+def temp_db(
+    df1: pd.DataFrame,
+    df2: pd.DataFrame,
+) -> Generator[tuple[str, str, str], None, None]:
+    """Create a temporary SQLite DB for use in testing.
+
+    The database holds two tables: ``temp``, populated from ``df1`` with ``pk`` as
+    its primary key, and ``temp_nopk``, populated from ``df2`` without an index.
+
+    Yields:
+        A ``(table, table_nopk, urlpath)`` tuple: the two table names followed by
+        the filesystem path of the SQLite file as a string.
+    """
+    # NamedTemporaryFile's context manager closes the file handle on exit, while
+    # delete=False leaves the empty file on disk for SQLite to open.
+    with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
+        urlpath = Path(tmp.name)
+    engine = sa.create_engine(f"sqlite:///{urlpath}")
+    with engine.connect() as con:
+        con.execute(
+            """CREATE TABLE temp (
+            pk BIGINT PRIMARY KEY,
+            a REAL NOT NULL,
+            b BIGINT NOT NULL,
+            c TEXT NOT NULL);"""
+        )
+        con.execute(
+            """CREATE TABLE temp_nopk (
+            d REAL NOT NULL,
+            e BIGINT NOT NULL,
+            f TEXT NOT NULL);"""
+        )
+        df1.to_sql("temp", con=con, if_exists="append")
+        df2.to_sql("temp_nopk", con=con, if_exists="append", index=False)
+    try:
+        yield "temp", "temp_nopk", str(urlpath)
+    finally:
+        # Close any connections the engine still holds before deleting the DB file.
+        engine.dispose()
+        if urlpath.is_file():
+            urlpath.unlink()
diff --git a/tests/integration/intake_sqlite_test.py b/tests/integration/intake_sqlite_test.py
diff --git a/tests/integration/sqlite_cat_test.py b/tests/integration/sqlite_cat_test.py
@@ -0,0 +1,39 @@
+"""SQLite Catalog integration tests."""
+from __future__ import annotations
+
+import logging
+
+import intake
+import pandas as pd
+from pandas.testing import assert_frame_equal
+
+from intake_sqlite import SQLiteCatalog
+
+# pytest imports this package last, so plugin is not auto-added
+intake.register_driver(name="sqlite_cat", driver=SQLiteCatalog)
+
+logger = logging.getLogger(__name__)
+
+
+def test_local_sqlite_catalog(
+    temp_db: tuple[str, str, str],
+    df1: pd.DataFrame,
+    df2: pd.DataFrame,
+) -> None:
+    """Check that a catalog built on the local temp DB exposes and reads both tables.
+
+    Both table names must be present in the catalog, and reading each one must
+    give back the dataframe it was populated from.
+    """
+    pk_table, nopk_table, db_path = temp_db
+    catalog = SQLiteCatalog(db_path)
+    # Verify membership of both tables before reading either of them.
+    for name in (pk_table, nopk_table):
+        assert name in catalog  # nosec: B101
+    for name, expected in ((pk_table, df1), (nopk_table, df2)):
+        assert_frame_equal(expected, getattr(catalog, name).read())
+
+
+def test_remote_sqlite_catalog() -> None:
+    """Check that a catalog can be built from a SQLite DB addressed by HTTPS URL."""
+    url = "https://global-power-plants.datasettes.com/global-power-plants.db"
+    catalog = SQLiteCatalog(urlpath=url)
+    assert "global-power-plants" in catalog  # nosec: B101
diff --git a/tests/integration/sqlite_src_test.py b/tests/integration/sqlite_src_test.py
@@ -0,0 +1,73 @@
+"""SQLite Intake Source integration tests."""
+from __future__ import annotations
+
+import logging
+
+import intake
+import pandas as pd
+from pandas.testing import assert_frame_equal
+
+from intake_sqlite import (
+    SQLiteSource,
+    SQLiteSourceAutoPartition,
+    SQLiteSourceManualPartition,
+)
+
+# pytest imports this package last, so plugin is not auto-added
+intake.register_driver(name="sqlite", driver=SQLiteSource)
+intake.register_driver(name="sqlite_auto", driver=SQLiteSourceAutoPartition)
+intake.register_driver(name="sqlite_manual", driver=SQLiteSourceManualPartition)
+
+logger = logging.getLogger(__name__)
+
+
+def test_temp_db_fixture(temp_db: tuple[str, str, str], df1: pd.DataFrame) -> None:
+    """Check the fixture itself: reading the keyed table with pandas gives ``df1``."""
+    pk_table, _, db_path = temp_db
+    sqlite_url = f"sqlite:///{db_path}"
+    from_db = pd.read_sql(pk_table, sqlite_url, index_col="pk")
+    assert_frame_equal(df1, from_db)
+
+
+def test_simple_src(temp_db: tuple[str, str, str], df1: pd.DataFrame) -> None:
+    """Check that an unpartitioned SQLiteSource reads the keyed table as ``df1``."""
+    pk_table, _, db_path = temp_db
+    source = SQLiteSource(db_path, pk_table, sql_kwargs={"index_col": "pk"})
+    assert_frame_equal(df1, source.read())
+
+
+def test_auto_src_partition(temp_db: tuple[str, str, str], df1: pd.DataFrame) -> None:
+    """Check that an auto-partitioned source splits and reads the keyed table.
+
+    Two partitions are requested on the ``pk`` index; both the discovered metadata
+    and the object returned by ``to_dask()`` must report two partitions, and the
+    full read must match ``df1``.
+    """
+    pk_table, _, db_path = temp_db
+    source = SQLiteSourceAutoPartition(
+        db_path,
+        pk_table,
+        index="pk",
+        sql_kwargs={"npartitions": 2},
+    )
+    expected_partitions = 2
+    assert source.discover()["npartitions"] == expected_partitions  # nosec: B101
+    assert source.to_dask().npartitions == expected_partitions  # nosec: B101
+    assert_frame_equal(df1, source.read())
+
+
+def test_manual_src_partition(temp_db: tuple[str, str, str], df1: pd.DataFrame) -> None:
+    """Test manual partitioning of table."""
+    table, table_nopk, urlpath = temp_db
+    # Two WHERE clauses that together select every row; two partitions are expected.
+    s = SQLiteSourceManualPartition(
+        urlpath,
+        "SELECT * FROM " + table,  # nosec: B608
+        where_values=["WHERE pk < 20", "WHERE pk >= 20"],
+        sql_kwargs=dict(index_col="pk"),
+    )
+    assert s.discover()["npartitions"] == 2  # nosec: B101
+    assert s.to_dask().npartitions == 2  # nosec: B101
+    actual = s.read()
+    assert_frame_equal(df1, actual)
+
+
+def test_remote_sqlite_source() -> None:
+    """Read a whole table from a remote SQLite DB and check the resulting shape.
+
+    The database is addressed by an HTTPS URL rather than a local path.
+    """
+    remote_db = "https://global-power-plants.datasettes.com/global-power-plants.db"
+    query = "SELECT * FROM 'global-power-plants'"
+    source = SQLiteSource(urlpath=remote_db, sql_expr=query)
+    assert source.read().shape == (34936, 36)  # nosec: B101
diff --git a/tests/unit/intake_sqlite_test.py b/tests/unit/intake_sqlite_test.py
@@ -1,17 +1,66 @@
-"""A dummy unit test so pytest has something to do."""
+"""SQLite Intake Catalog unit tests."""
+from __future__ import annotations
+
 import logging
 from pathlib import Path
 
+import pytest
+
 from intake_sqlite import urlpath_to_sqliteurl
 
 logger = logging.getLogger(__name__)
 
-TEST_DIR = Path(__file__).parent.parent.resolve()
+DATA_DIR = Path(__file__).resolve().parents[1] / "data"
+
+BAD_FILES: list[tuple[str, type[Exception]]] = [
+    ("database.wtf", ValueError),
+    ("dbdump.sql", ValueError),
+    ("nonexistent.db", ValueError),
+    ("nonexistent.sqlite", ValueError),
+]
+
+BAD_URLS: list[tuple[str, type[Exception]]] = [
+    ("https://catalyst.coop/pudl.wtf", ValueError),
+    ("s3://catalyst.coop/pudl.dude", ValueError),
+    ("gs://catalyst.coop/pudl.sql", ValueError),
+    ("wtftp://catalyst.coop/pudl.sqlite", ValueError),
+    ("wtftp://catalyst.coop/pudl.db", ValueError),
+]
+
+
+@pytest.mark.parametrize("filename,exc", BAD_FILES)
+def test_bad_filenames(filename: str, exc: type[Exception], tmp_path: Path) -> None:
+    """Check that bad or missing file paths raise the expected exception.
+
+    The file is never created, so every path under ``tmp_path`` is non-existent.
+    """
+    bad_path = str(tmp_path / filename)
+    with pytest.raises(exc):
+        urlpath_to_sqliteurl(bad_path)
 
 
-def test_urlpath_to_sqliteurl() -> None:
+@pytest.mark.parametrize("dirname,exc", BAD_FILES)
+def test_bad_dirnames(dirname: str, exc: type[Exception], tmp_path: Path) -> None:
+    """Check that a path pointing at a directory rather than a file is rejected."""
+    directory = tmp_path / dirname
+    directory.mkdir()
+    bad_path = str(directory)
+    with pytest.raises(exc):
+        urlpath_to_sqliteurl(bad_path)
+
+
+@pytest.mark.parametrize("url,exc", BAD_URLS)
+def test_bad_urls(url: str, exc: type[Exception]) -> None:
+    """Check that unusable URLs are rejected with the expected exception.
+
+    ``BAD_URLS`` pairs each URL with the exception type it should raise.
+    """
+    expect_failure = pytest.raises(exc)
+    with expect_failure:
+        urlpath_to_sqliteurl(url)
+
+
+def test_local_path_to_sqliteurl() -> None:
     """Test our transformation of paths/URLs into SQL Alchemy URLs."""
-    expected_local_url = "sqlite:///" + str(TEST_DIR / "data/test.db")
-    test_db_path = TEST_DIR / "data/test.db"
+    expected_local_url = f"sqlite:///{DATA_DIR / 'test.db'}"
+    test_db_path = DATA_DIR / "test.db"
     actual_local_url = urlpath_to_sqliteurl(str(test_db_path))
     assert actual_local_url == expected_local_url  # nosec: B101
+
+
+# Note: There's no remote URL unit test for a working input to urlpath_to_sqliteurl()
+# because it's exercised in the integration tests, and the local path of the cached
+# file is hard to predict: by default fsspec's simplecache names it with a hash of
+# the URL.
diff --git a/tox.ini b/tox.ini
@@ -5,6 +5,7 @@ envlist = ci
 allowlist_externals =
     bash
     coverage
+    mypy
     sphinx-build
     twine
 # shared directory for re-used packages