diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index eeb09a0..6793413 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -9,31 +9,37 @@ on: jobs: run-ci: runs-on: ubuntu-latest - container: - image: dipanshuwed/glue4.0 + container: dipanshuwed/glue4.0:latest + steps: - name: Checkout repository uses: actions/checkout@v3 - name: Install dependencies run: | - pip3 install mypy pylint coverage - yum install jq -y # jq is required for sonar + python3 -m pip install --upgrade pip + pip3 install -r requirements.txt + yum install -y jq - name: Type check run: mypy ./ --ignore-missing-imports - name: Lint - run: pylint app/ main.py setup.py --output pylint-report.txt + run: | + pylint app tests main.py setup.py --output pylint-report.txt + pylint app tests main.py setup.py - name: Testing - run: coverage run --source=app -m unittest discover -s app/tests/ + run: | + export KAGGLE_KEY=MOCKKEY + export KAGGLE_USERNAME=MOCKUSERNAME + coverage run --source=app -m unittest discover -s tests - name: Test coverage report run: | coverage report coverage xml - + - name: SonarQube Scan uses: sonarsource/sonarqube-scan-action@master with: @@ -43,7 +49,7 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} SONAR_HOST_URL: ${{ secrets.SONAR_HOST_URL }} - + - uses: sonarsource/sonarqube-quality-gate-action@master timeout-minutes: 5 env: diff --git a/.gitignore b/.gitignore index 0fff62c..5f4ce98 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .env .coverage -app/__pycache__ -app/tests/__pycache__ +*__pycache__ +temp +htmlcov diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4b60116..cfce02d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,7 +25,7 @@ repos: - id: lint name: Linting - entry: pylint app/ main.py setup.py + entry: pylint app tests main.py setup.py language: system always_run: true types: [python3] diff --git a/README.md b/README.md index 
bbfceff..dd3adb2 100644 --- a/README.md +++ b/README.md @@ -96,7 +96,7 @@ Refer: [Glue Programming libraries](https://docs.aws.amazon.com/glue/latest/dg/a To run tests in the root of the directory use: - coverage run --source=app -m unittest discover -s app/tests/ + coverage run --source=app -m unittest discover -s tests coverage report Note that awsglue libraries are not availabe to download, so use AWS Glue 4 Docker container. diff --git a/app/Extraction.py b/app/Extraction.py index c78bad8..13bfaf6 100644 --- a/app/Extraction.py +++ b/app/Extraction.py @@ -6,7 +6,6 @@ def extract_from_kaggle(flag: bool): if flag: - print("WRONG BIT RAN!!!!") read_path = "/dbfs/mnt/rawdata/" write_path = "/mnt/transformed/" else: diff --git a/app/SparkWrapper.py b/app/SparkWrapper.py index 2a0d98c..4816291 100644 --- a/app/SparkWrapper.py +++ b/app/SparkWrapper.py @@ -2,7 +2,9 @@ from pyspark.sql import Window, WindowSpec -def create_frame(sc: SparkSession, path: str): +def create_frame(sc: SparkSession | None, path: str): + if sc is None: + raise TypeError(f"{sc} is None. 
Pass Spark Session") df = sc.read.csv(path, inferSchema=True, header=True) return df diff --git a/main.py b/main.py index b730912..0493b56 100644 --- a/main.py +++ b/main.py @@ -31,21 +31,30 @@ flag = bool(flag == "True") +# if 'spark' in locals(): +# flag = True +# else: +# spark = None +# dbutils = None +# flag = False + # COMMAND ---------- if flag: - os.environ["KAGGLE_USERNAME"] = keys["kaggle_username"] + os.environ["KAGGLE_USERNAME"] = dbutils.widgets.get("kaggle_username") - os.environ["KAGGLE_KEY"] = keys["kaggle_token"] + os.environ["KAGGLE_KEY"] = dbutils.widgets.get("kaggle_token") - os.environ["storage_account_name"] = keys["storage_account_name"] + os.environ["storage_account_name"] = dbutils.widgets.get("storage_account_name") - os.environ["datalake_access_key"] = keys["datalake_access_key"] + os.environ["datalake_access_key"] = dbutils.widgets.get("datalake_access_key") # COMMAND ---------- if flag: + import app.connect_databricks as cd + # creating mounts cd.create_mount(dbutils, "zipdata", "/mnt/zipdata/") cd.create_mount(dbutils, "rawdata", "/mnt/rawdata/") diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..d7770a0 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +mypy~=1.7.1 +pylint~=3.0.2 +coverage~=7.3.2 +python-dotenv~=1.0.0 +kaggle~=1.5.16 +pre-commit~=3.6.0 diff --git a/sonar-project.properties b/sonar-project.properties index 45c55c7..6818bb0 100644 --- a/sonar-project.properties +++ b/sonar-project.properties @@ -8,8 +8,8 @@ sonar.python.coverage.reportPaths=*coverage.xml sonar.python.pylint_config=.pylintrc sonar.python.pylint=/usr/local/bin/pylint sonar.inclusions=**/app/**,**/main.py -sonar.exclusions=**/app/**/tests/test_*.py,**/__init__.py,**/src/**/*.csv,**/app/tests/mock/*.* -sonar.test.exclusions=**/app/**/tests/test_*.py,**/__init__.py,**/app/**/*.csv,**/app/tests/mock/*.* -sonar.coverage.exclusions=**/app/**/tests/test_*.py,**/__init__.py,**/app/**/*.csv,**/app/tests/mock/*.* 
+sonar.exclusions=**/tests/test_*.py,**/__init__.py,**/tests/mock/*.* +sonar.test.exclusions=**/tests/test_*.py,**/__init__.py,**/tests/mock/*.* +sonar.coverage.exclusions=**/tests/test_*.py,**/__init__.py,**/tests/mock/*.* sonar.text.excluded.file.suffixes=csv sonar.python.version=3.7 diff --git a/app/tests/__init__.py b/tests/__init__.py similarity index 100% rename from app/tests/__init__.py rename to tests/__init__.py diff --git a/app/tests/connect_glue_test.py b/tests/connect_glue_test.py similarity index 100% rename from app/tests/connect_glue_test.py rename to tests/connect_glue_test.py diff --git a/app/tests/mock/sample.csv b/tests/mock/sample.csv similarity index 100% rename from app/tests/mock/sample.csv rename to tests/mock/sample.csv diff --git a/tests/test_Extraction.py b/tests/test_Extraction.py new file mode 100644 index 0000000..601d30d --- /dev/null +++ b/tests/test_Extraction.py @@ -0,0 +1,74 @@ +import unittest +from unittest.mock import patch +from app.Extraction import extract_from_kaggle + + +class TestExtraction(unittest.TestCase): + @patch("app.Extraction.kaggle") + def test_extract_from_kaggle_success(self, mock_kaggle): + mock_kaggle_instance = mock_kaggle + mock_api_instance = mock_kaggle_instance.KaggleApi.return_value + # Mocking authenticate and dataset_download_cli methods + mock_api_instance.authenticate.return_value = None + mock_api_instance.dataset_download_cli.return_value = None + + # Call the function to test with flag=True for success case + result = extract_from_kaggle(True) + + # Assertions + mock_kaggle_instance.KaggleApi.assert_called_once() + mock_api_instance.authenticate.assert_called_once() + mock_api_instance.dataset_download_cli.assert_called_once_with( + "mastmustu/insurance-claims-fraud-data", + unzip=True, + path="/dbfs/mnt/rawdata/", + ) + + self.assertEqual(result, ("/mnt/rawdata/", "/mnt/transformed/")) + + @patch("app.Extraction.kaggle") + def test_extract_from_kaggle_success_false(self, mock_kaggle): + 
mock_kaggle_instance = mock_kaggle + mock_api_instance = mock_kaggle_instance.KaggleApi.return_value + # Mocking authenticate and dataset_download_cli methods + mock_api_instance.authenticate.return_value = None + mock_api_instance.dataset_download_cli.return_value = None + + # Call the function to test with flag=False for success case + result = extract_from_kaggle(False) + + # Assertions + mock_kaggle_instance.KaggleApi.assert_called_once() + mock_api_instance.authenticate.assert_called_once() + mock_api_instance.dataset_download_cli.assert_called_once_with( + "mastmustu/insurance-claims-fraud-data", unzip=True, path="temp/" + ) + + self.assertEqual( + result, + ( + "s3://glue-bucket-vighnesh/rawdata/", + "s3://glue-bucket-vighnesh/transformed/", + ), + ) + + @patch("app.Extraction.kaggle") + def test_extract_from_kaggle_failure(self, mock_kaggle): + mock_kaggle_instance = mock_kaggle + mock_api_instance = mock_kaggle_instance.KaggleApi.return_value + # Mocking authenticate and dataset_download_cli methods + mock_api_instance.authenticate.side_effect = Exception( + "Simulated authentication failure" + ) + + # Call the function to test with flag=False for failure case + with self.assertRaises(Exception) as context: + extract_from_kaggle(False) + + # Assertions + mock_kaggle_instance.KaggleApi.assert_called_once() + mock_api_instance.authenticate.assert_called_once() + mock_api_instance.dataset_download_cli.assert_not_called() + + # Check if the correct exception is raised + self.assertEqual(str(context.exception), "Simulated authentication failure") diff --git a/app/tests/test_SparkWrapper.py b/tests/test_SparkWrapper.py similarity index 95% rename from app/tests/test_SparkWrapper.py rename to tests/test_SparkWrapper.py index eb2fe87..a907bf3 100644 --- a/app/tests/test_SparkWrapper.py +++ b/tests/test_SparkWrapper.py @@ -9,9 +9,8 @@ def setUp(self) -> None: self.spark = ( SparkSession.builder.master("local").appName("Testing").getOrCreate() ) - self.df = 
self.spark.read.csv( - "app/tests/mock/sample.csv", inferSchema=True, header=True - ) + self.path = "tests/mock/sample.csv" + self.df = self.spark.read.csv(self.path, inferSchema=True, header=True) super().setUp() def tearDown(self) -> None: @@ -43,8 +42,7 @@ def test_rename_columns(self): self.assertListEqual(actual_columns, expected_columns) def test_create_frame(self): - path = "app/tests/mock/sample.csv" - df = create_frame(self.spark, path).drop("date") + df = create_frame(self.spark, self.path).drop("date") actual_data = df.collect() expected_data = [ diff --git a/app/tests/test_SparkWrapperFailure.py b/tests/test_SparkWrapperFailure.py similarity index 95% rename from app/tests/test_SparkWrapperFailure.py rename to tests/test_SparkWrapperFailure.py index e2c453e..7425b42 100644 --- a/app/tests/test_SparkWrapperFailure.py +++ b/tests/test_SparkWrapperFailure.py @@ -11,9 +11,8 @@ def setUp(self) -> None: self.spark = ( SparkSession.builder.appName("Testing").master("local[*]").getOrCreate() ) - self.df = self.spark.read.csv( - "app/tests/mock/sample.csv", inferSchema=True, header=True - ) + self.path = "tests/mock/sample.csv" + self.df = self.spark.read.csv(self.path, inferSchema=True, header=True) super().setUp() def tearDown(self) -> None: diff --git a/tests/test_connect_glue.py b/tests/test_connect_glue.py new file mode 100644 index 0000000..16a837a --- /dev/null +++ b/tests/test_connect_glue.py @@ -0,0 +1,52 @@ +import unittest +from unittest.mock import patch, MagicMock +from app.connect_glue import init_glue + + +class TestInitGlue(unittest.TestCase): + @patch("app.connect_glue.SparkContext") + @patch("app.connect_glue.GlueContext") + @patch("app.connect_glue.Job") + def test_init_glue(self, mock_job, mock_glue_context, mock_spark_context): + # Mock the SparkContext, GlueContext, and Job + mock_spark_context_instance = MagicMock() + mock_glue_context_instance = MagicMock() + mock_job_instance = MagicMock() + + # Set up the behavior of the mock instances 
+ mock_spark_context.return_value = mock_spark_context_instance + mock_glue_context.return_value = mock_glue_context_instance + mock_job.return_value = mock_job_instance + + # Call the function to test + glue_context, spark, job = init_glue() + + # Assertions + mock_spark_context.assert_called_once() + mock_glue_context.assert_called_once_with(mock_spark_context_instance) + mock_job.assert_called_once_with(mock_glue_context_instance) + + # Check if the returned values are correct + self.assertEqual(glue_context, mock_glue_context_instance) + self.assertEqual(spark, mock_glue_context_instance.spark_session) + self.assertEqual(job, mock_job_instance) + + @patch("app.connect_glue.SparkContext") + @patch("app.connect_glue.GlueContext") + @patch("app.connect_glue.Job") + def test_init_glue_failure(self, mock_job, mock_glue_context, mock_spark_context): + # Simulate a ValueError during SparkContext initialization + error_statement = "Simulated SparkContext initialization failure" + mock_spark_context.side_effect = ValueError(error_statement) + + # Call the function to test + with self.assertRaises(ValueError) as context: + init_glue() + + # Assertions + mock_spark_context.assert_called_once() + mock_glue_context.assert_not_called() # GlueContext should not be called if SparkContext initialization fails + mock_job.assert_not_called() # Job should not be called if SparkContext initialization fails + + # Check if the error displayed correctly + self.assertEqual(str(context.exception), error_statement)