Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[IO] Improved IO with support for reading data from compressed files #308

Merged
merged 8 commits into from
Jun 19, 2024
30 changes: 22 additions & 8 deletions kloppy/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,31 @@
import os
import urllib.parse
from dataclasses import dataclass, replace
from pathlib import PurePath
from typing import Union, IO, BinaryIO, Tuple

from io import BytesIO
from pathlib import PurePath
from typing import IO, BinaryIO, Tuple, Union

from kloppy.config import get_config
from kloppy.exceptions import InputNotFoundError
from kloppy.infra.io.adapters import get_adapter


logger = logging.getLogger(__name__)

_open = open

def _open(file: str, mode: str):
    """Open ``file`` in ``mode``, choosing the opener from the file suffix.

    Paths ending in ``.gz``, ``.xz`` or ``.bz2`` are opened with ``gzip``,
    ``lzma`` or ``bz2`` respectively; every other path is handed to the
    built-in ``open``. The compression module is imported only when its
    suffix is matched.

    Args:
        file: Path of the file to open.
        mode: Mode string, forwarded unchanged to the selected opener.

    Returns:
        The file object produced by the selected opener.
    """
    if file.endswith(".gz"):
        import gzip as codec
    elif file.endswith(".xz"):
        import lzma as codec
    elif file.endswith(".bz2"):
        import bz2 as codec
    else:
        # No recognised compression suffix: treat it as a plain file.
        return open(file, mode)

    return codec.open(file, mode)


@dataclass(frozen=True)
Expand All @@ -35,10 +47,12 @@ def create(cls, input_: "FileLike", **kwargs):


def get_file_extension(f: FileLike) -> str:
if isinstance(f, str):
if isinstance(f, PurePath) or isinstance(f, str):
f = str(f)
for ext in [".gz", ".xz", ".bz2"]:
probberechts marked this conversation as resolved.
Show resolved Hide resolved
if f.endswith(ext):
f = f[: -len(ext)]
return os.path.splitext(f)[1]
elif isinstance(f, PurePath):
return os.path.splitext(f.name)[1]
elif isinstance(f, Source):
return get_file_extension(f.data)
else:
Expand Down
37 changes: 11 additions & 26 deletions kloppy/tests/test_helpers.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,31 @@
import os
import sys
from pathlib import Path

import pytest

from kloppy.config import config_context
from pandas import DataFrame
from pandas.testing import assert_frame_equal


from kloppy import opta, statsbomb, tracab
from kloppy.config import config_context
from kloppy.domain import (
Period,
DatasetFlag,
Point,
AttackingDirection,
TrackingDataset,
NormalizedPitchDimensions,
DatasetFlag,
Dimension,
Orientation,
Provider,
Frame,
Ground,
Metadata,
MetricaCoordinateSystem,
Team,
Ground,
NormalizedPitchDimensions,
Orientation,
Period,
Player,
PlayerData,
Point,
Point3D,
Provider,
Team,
TrackingDataset,
)

from kloppy import opta, tracab, statsbomb
from kloppy.io import open_as_file


class TestHelpers:
def _get_tracking_dataset(self):
Expand Down Expand Up @@ -517,12 +511,3 @@ def test_to_df_pyarrow(self):
df = dataset.to_df(engine="pandas[pyarrow]")
assert isinstance(df, pd.DataFrame)
assert isinstance(df.dtypes["ball_x"], pd.ArrowDtype)


class TestOpenAsFile:
    """Checks ``open_as_file`` when it is given a filesystem path."""

    def test_path(self):
        """The length of the data read equals the file's size on disk."""
        xml_file = Path(__file__).parent / "files/tracab_meta.xml"
        expected_size = os.path.getsize(xml_file)

        with open_as_file(xml_file) as handle:
            content = handle.read()

        assert len(content) == expected_size
88 changes: 88 additions & 0 deletions kloppy/tests/test_io.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import os
from pathlib import Path

from kloppy.io import open_as_file, get_file_extension


class TestOpenAsFile:
    """Tests for the open_as_file function."""

    def test_bytes(self, base_dir: Path):
        """It should be able to open a file from a bytes object."""
        path = base_dir / "files" / "tracab_meta.xml"
        with open(path, "rb") as f:
            data = f.read()

        with open_as_file(data) as fp:
            assert fp.read() == data

    def test_str(self, base_dir: Path):
        """It should be able to open a file from a string object."""
        path = str(base_dir / "files" / "tracab_meta.xml")
        with open_as_file(path) as fp:
            data = fp.read()

        assert len(data) == os.path.getsize(path)

    def test_path(self, base_dir: Path):
        """It should be able to open a file from a Path object."""
        path = base_dir / "files" / "tracab_meta.xml"
        with open_as_file(path) as fp:
            data = fp.read()

        assert len(data) == os.path.getsize(path)

    def test_gzip(self, base_dir: Path, tmp_path: Path):
        """It should be able to open a gzipped file."""
        raw_path = base_dir / "files" / "tracab_meta.xml"
        gz_path = tmp_path / "tracab_meta.xml.gz"
        # Create a gzipped copy of the raw file
        import gzip

        with open(raw_path, "rb") as f:
            with gzip.open(gz_path, "wb") as f_out:
                f_out.write(f.read())
        # Read the compressed copy (not the raw file), so the gzip path is
        # actually exercised
        with open_as_file(gz_path) as fp:
            data = fp.read()

        # The decompressed content must be as long as the raw file
        assert len(data) == os.path.getsize(raw_path)

    def test_xz(self, base_dir: Path, tmp_path: Path):
        """It should be able to open a LZMA-compressed file."""
        raw_path = base_dir / "files" / "tracab_meta.xml"
        # The suffix must be ".xz": the opener is selected from the suffix
        xz_path = tmp_path / "tracab_meta.xml.xz"
        # Create a LZMA-compressed copy of the raw file
        import lzma

        with open(raw_path, "rb") as f:
            with lzma.open(xz_path, "wb") as f_out:
                f_out.write(f.read())
        # Read the compressed copy (not the raw file)
        with open_as_file(xz_path) as fp:
            data = fp.read()

        # The decompressed content must be as long as the raw file
        assert len(data) == os.path.getsize(raw_path)

    def test_bz2(self, base_dir: Path, tmp_path: Path):
        """It should be able to open a bzip2-compressed file."""
        raw_path = base_dir / "files" / "tracab_meta.xml"
        # The suffix must be ".bz2": the opener is selected from the suffix
        bz2_path = tmp_path / "tracab_meta.xml.bz2"
        # Create a bz2-compressed copy of the raw file
        import bz2

        with open(raw_path, "rb") as f:
            with bz2.open(bz2_path, "wb") as f_out:
                f_out.write(f.read())
        # Read the compressed copy (not the raw file)
        with open_as_file(bz2_path) as fp:
            data = fp.read()

        # The decompressed content must be as long as the raw file
        assert len(data) == os.path.getsize(raw_path)


def test_get_file_extension():
    """The extension is taken from the name after a compression suffix is dropped."""
    cases = [
        (Path("data.xml"), ".xml"),
        ("data.xml", ".xml"),
        # Each supported compression suffix is stripped before the split.
        ("data.xml.gz", ".xml"),
        ("data.xml.xz", ".xml"),
        ("data.xml.bz2", ".xml"),
        # Path objects take the same route as strings.
        (Path("data.xml.gz"), ".xml"),
        ("data", ""),
    ]
    for file_like, expected in cases:
        assert get_file_extension(file_like) == expected, file_like