Skip to content

Commit

Permalink
Merge pull request #5 from wednesday-solutions/fix/mypy-type
Browse files Browse the repository at this point in the history
Feat: Test Cases Added
  • Loading branch information
idipanshu authored Dec 13, 2023
2 parents bf44e50 + bc8287f commit 16fbbae
Show file tree
Hide file tree
Showing 16 changed files with 175 additions and 29 deletions.
22 changes: 14 additions & 8 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,31 +9,37 @@ on:
jobs:
run-ci:
runs-on: ubuntu-latest
container:
image: dipanshuwed/glue4.0
container: dipanshuwed/glue4.0:latest

steps:
- name: Checkout repository
uses: actions/checkout@v3

- name: Install dependencies
run: |
pip3 install mypy pylint coverage
yum install jq -y # jq is required for sonar
python3 -m pip install --upgrade pip
pip3 install -r requirements.txt
yum install -y jq
- name: Type check
run: mypy ./ --ignore-missing-imports

- name: Lint
run: pylint app/ main.py setup.py --output pylint-report.txt
run: |
pylint app tests main.py setup.py --output pylint-report.txt
pylint app tests main.py setup.py
- name: Testing
run: coverage run --source=app -m unittest discover -s app/tests/
run: |
export KAGGLE_KEY=MOCKKEY
export KAGGLE_USERNAME=MOCKUSERNAME
coverage run --source=app -m unittest discover -s tests
- name: Test coverage report
run: |
coverage report
coverage xml
- name: SonarQube Scan
uses: sonarsource/sonarqube-scan-action@master
with:
Expand All @@ -43,7 +49,7 @@ jobs:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
SONAR_HOST_URL: ${{ secrets.SONAR_HOST_URL }}

- uses: sonarsource/sonarqube-quality-gate-action@master
timeout-minutes: 5
env:
Expand Down
5 changes: 3 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
.env
.coverage
app/__pycache__
app/tests/__pycache__
*__pycache__
temp
htmlcov
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ repos:

- id: lint
name: Linting
entry: pylint app/ main.py setup.py
entry: pylint app tests main.py setup.py
language: system
always_run: true
types: [python3]
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ Refer: [Glue Programming libraries](https://docs.aws.amazon.com/glue/latest/dg/a

To run tests in the root of the directory use:

coverage run --source=app -m unittest discover -s app/tests/
coverage run --source=app -m unittest discover -s tests
coverage report

Note that awsglue libraries are not available to download, so use the AWS Glue 4 Docker container.
1 change: 0 additions & 1 deletion app/Extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

def extract_from_kaggle(flag: bool):
if flag:
print("WRONG BIT RAN!!!!")
read_path = "/dbfs/mnt/rawdata/"
write_path = "/mnt/transformed/"
else:
Expand Down
4 changes: 3 additions & 1 deletion app/SparkWrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
from pyspark.sql import Window, WindowSpec


def create_frame(sc: SparkSession, path: str):
def create_frame(sc: SparkSession | None, path: str):
if sc is None:
raise TypeError(f"{sc} is None. Pass Spark Session")
df = sc.read.csv(path, inferSchema=True, header=True)
return df

Expand Down
17 changes: 13 additions & 4 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,21 +31,30 @@

flag = bool(flag == "True")

# if 'spark' in locals():
# flag = True
# else:
# spark = None
# dbutils = None
# flag = False


# COMMAND ----------

if flag:
os.environ["KAGGLE_USERNAME"] = keys["kaggle_username"]
os.environ["KAGGLE_USERNAME"] = dbutils.widgets.get("kaggle_username")

os.environ["KAGGLE_KEY"] = keys["kaggle_token"]
os.environ["KAGGLE_KEY"] = dbutils.widgets.get("kaggle_token")

os.environ["storage_account_name"] = keys["storage_account_name"]
os.environ["storage_account_name"] = dbutils.widgets.get("storage_account_name")

os.environ["datalake_access_key"] = keys["datalake_access_key"]
os.environ["datalake_access_key"] = dbutils.widgets.get("datalake_access_key")


# COMMAND ----------
if flag:
import app.connect_databricks as cd

# creating mounts
cd.create_mount(dbutils, "zipdata", "/mnt/zipdata/")
cd.create_mount(dbutils, "rawdata", "/mnt/rawdata/")
Expand Down
6 changes: 6 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
mypy~=1.7.1
pylint~=3.0.2
coverage~=7.3.2
python-dotenv~=1.0.0
kaggle~=1.5.16
pre-commit~=3.6.0
6 changes: 3 additions & 3 deletions sonar-project.properties
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ sonar.python.coverage.reportPaths=*coverage.xml
sonar.python.pylint_config=.pylintrc
sonar.python.pylint=/usr/local/bin/pylint
sonar.inclusions=**/app/**,**/main.py
sonar.exclusions=**/app/**/tests/test_*.py,**/__init__.py,**/src/**/*.csv,**/app/tests/mock/*.*
sonar.test.exclusions=**/app/**/tests/test_*.py,**/__init__.py,**/app/**/*.csv,**/app/tests/mock/*.*
sonar.coverage.exclusions=**/app/**/tests/test_*.py,**/__init__.py,**/app/**/*.csv,**/app/tests/mock/*.*
sonar.exclusions=**/tests/test_*.py,**/__init__.py,**/tests/mock/*.*
sonar.test.exclusions=**/tests/test_*.py,**/__init__.py,**/tests/mock/*.*
sonar.coverage.exclusions=**/tests/test_*.py,**/__init__.py,**/tests/mock/*.*
sonar.text.excluded.file.suffixes=csv
sonar.python.version=3.7
File renamed without changes.
File renamed without changes.
File renamed without changes.
74 changes: 74 additions & 0 deletions tests/test_Extraction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import unittest
from unittest.mock import patch
from app.Extraction import extract_from_kaggle


class TestExtraction(unittest.TestCase):
    """Unit tests for extract_from_kaggle with the kaggle client mocked out."""

    @patch("app.Extraction.kaggle")
    def test_extract_from_kaggle_success(self, mock_kaggle):
        # The KaggleApi factory hands back a mock whose methods succeed silently.
        api = mock_kaggle.KaggleApi.return_value
        api.authenticate.return_value = None
        api.dataset_download_cli.return_value = None

        # flag=True selects the Databricks (dbfs) read/write paths.
        result = extract_from_kaggle(True)

        mock_kaggle.KaggleApi.assert_called_once()
        api.authenticate.assert_called_once()
        api.dataset_download_cli.assert_called_once_with(
            "mastmustu/insurance-claims-fraud-data",
            unzip=True,
            path="/dbfs/mnt/rawdata/",
        )

        self.assertEqual(result, ("/mnt/rawdata/", "/mnt/transformed/"))

    @patch("app.Extraction.kaggle")
    def test_extract_from_kaggle_success_false(self, mock_kaggle):
        api = mock_kaggle.KaggleApi.return_value
        api.authenticate.return_value = None
        api.dataset_download_cli.return_value = None

        # flag=False selects the S3 paths and a local temp download dir.
        result = extract_from_kaggle(False)

        mock_kaggle.KaggleApi.assert_called_once()
        api.authenticate.assert_called_once()
        api.dataset_download_cli.assert_called_once_with(
            "mastmustu/insurance-claims-fraud-data", unzip=True, path="temp/"
        )

        self.assertEqual(
            result,
            (
                "s3://glue-bucket-vighnesh/rawdata/",
                "s3://glue-bucket-vighnesh/transformed/",
            ),
        )

    @patch("app.Extraction.kaggle")
    def test_extract_from_kaggle_failure(self, mock_kaggle):
        api = mock_kaggle.KaggleApi.return_value
        # Authentication blows up, so the download step must never run.
        api.authenticate.side_effect = Exception(
            "Simulated authentication failure"
        )

        with self.assertRaises(Exception) as context:
            extract_from_kaggle(False)

        mock_kaggle.KaggleApi.assert_called_once()
        api.authenticate.assert_called_once()
        api.dataset_download_cli.assert_not_called()

        # The original exception must propagate unchanged.
        self.assertEqual(str(context.exception), "Simulated authentication failure")
8 changes: 3 additions & 5 deletions app/tests/test_SparkWrapper.py → tests/test_SparkWrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,8 @@ def setUp(self) -> None:
self.spark = (
SparkSession.builder.master("local").appName("Testing").getOrCreate()
)
self.df = self.spark.read.csv(
"app/tests/mock/sample.csv", inferSchema=True, header=True
)
self.path = "tests/mock/sample.csv"
self.df = self.spark.read.csv(self.path, inferSchema=True, header=True)
super().setUp()

def tearDown(self) -> None:
Expand Down Expand Up @@ -43,8 +42,7 @@ def test_rename_columns(self):
self.assertListEqual(actual_columns, expected_columns)

def test_create_frame(self):
path = "app/tests/mock/sample.csv"
df = create_frame(self.spark, path).drop("date")
df = create_frame(self.spark, self.path).drop("date")
actual_data = df.collect()

expected_data = [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,8 @@ def setUp(self) -> None:
self.spark = (
SparkSession.builder.appName("Testing").master("local[*]").getOrCreate()
)
self.df = self.spark.read.csv(
"app/tests/mock/sample.csv", inferSchema=True, header=True
)
self.path = "tests/mock/sample.csv"
self.df = self.spark.read.csv(self.path, inferSchema=True, header=True)
super().setUp()

def tearDown(self) -> None:
Expand Down
52 changes: 52 additions & 0 deletions tests/test_connect_glue.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import unittest
from unittest.mock import patch, MagicMock
from app.connect_glue import init_glue


class TestInitGlue(unittest.TestCase):
    """Unit tests for init_glue with SparkContext, GlueContext and Job mocked."""

    @patch("app.connect_glue.SparkContext")
    @patch("app.connect_glue.GlueContext")
    @patch("app.connect_glue.Job")
    def test_init_glue(self, mock_job, mock_glue_context, mock_spark_context):
        # Stand-in objects returned by each patched constructor.
        spark_ctx = MagicMock()
        glue_ctx = MagicMock()
        glue_job = MagicMock()
        mock_spark_context.return_value = spark_ctx
        mock_glue_context.return_value = glue_ctx
        mock_job.return_value = glue_job

        got_context, got_spark, got_job = init_glue()

        # Each constructor runs exactly once, chained in dependency order.
        mock_spark_context.assert_called_once()
        mock_glue_context.assert_called_once_with(spark_ctx)
        mock_job.assert_called_once_with(glue_ctx)

        # The returned triple is (glue context, its spark session, job).
        self.assertEqual(got_context, glue_ctx)
        self.assertEqual(got_spark, glue_ctx.spark_session)
        self.assertEqual(got_job, glue_job)

    @patch("app.connect_glue.SparkContext")
    @patch("app.connect_glue.GlueContext")
    @patch("app.connect_glue.Job")
    def test_init_glue_failure(self, mock_job, mock_glue_context, mock_spark_context):
        # SparkContext construction fails, so nothing downstream may be built.
        error_statement = "Simulated SparkContext initialization failure"
        mock_spark_context.side_effect = ValueError(error_statement)

        with self.assertRaises(ValueError) as context:
            init_glue()

        mock_spark_context.assert_called_once()
        mock_glue_context.assert_not_called()  # skipped once SparkContext fails
        mock_job.assert_not_called()  # likewise never reached

        # The original error message must propagate unchanged.
        self.assertEqual(str(context.exception), error_statement)

0 comments on commit 16fbbae

Please sign in to comment.