From 0f90993da94b433d114282cf11d0c735f4fca1e3 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Fri, 8 Dec 2023 14:59:48 +0530 Subject: [PATCH 01/50] fix: made tests different --- {app/tests => tests}/__init__.py | 0 {app/tests => tests}/connect_glue_test.py | 0 {app/tests => tests}/mock/sample.csv | 0 {app/tests => tests}/test_SparkWrapper.py | 0 {app/tests => tests}/test_SparkWrapperFailure.py | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename {app/tests => tests}/__init__.py (100%) rename {app/tests => tests}/connect_glue_test.py (100%) rename {app/tests => tests}/mock/sample.csv (100%) rename {app/tests => tests}/test_SparkWrapper.py (100%) rename {app/tests => tests}/test_SparkWrapperFailure.py (100%) diff --git a/app/tests/__init__.py b/tests/__init__.py similarity index 100% rename from app/tests/__init__.py rename to tests/__init__.py diff --git a/app/tests/connect_glue_test.py b/tests/connect_glue_test.py similarity index 100% rename from app/tests/connect_glue_test.py rename to tests/connect_glue_test.py diff --git a/app/tests/mock/sample.csv b/tests/mock/sample.csv similarity index 100% rename from app/tests/mock/sample.csv rename to tests/mock/sample.csv diff --git a/app/tests/test_SparkWrapper.py b/tests/test_SparkWrapper.py similarity index 100% rename from app/tests/test_SparkWrapper.py rename to tests/test_SparkWrapper.py diff --git a/app/tests/test_SparkWrapperFailure.py b/tests/test_SparkWrapperFailure.py similarity index 100% rename from app/tests/test_SparkWrapperFailure.py rename to tests/test_SparkWrapperFailure.py From ce3b5bcfd947d1a4b52ddbd879064ffd74eeee00 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Fri, 8 Dec 2023 15:28:12 +0530 Subject: [PATCH 02/50] fix: added new filename --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 0fff62c..887d533 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ .coverage app/__pycache__ app/tests/__pycache__ +tests/__pycache__ From 
748fb181f52408445da54752001ac4817099a4b6 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Fri, 8 Dec 2023 15:28:53 +0530 Subject: [PATCH 03/50] fix: updated testing line --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bbfceff..dd3adb2 100644 --- a/README.md +++ b/README.md @@ -96,7 +96,7 @@ Refer: [Glue Programming libraries](https://docs.aws.amazon.com/glue/latest/dg/a To run tests in the root of the directory use: - coverage run --source=app -m unittest discover -s app/tests/ + coverage run --source=app -m unittest discover -s tests coverage report Note that awsglue libraries are not availabe to download, so use AWS Glue 4 Docker container. From c95dab5c6084d3baa34409ac12d2a7c5f828c928 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Fri, 8 Dec 2023 15:29:35 +0530 Subject: [PATCH 04/50] fix: updated testing line --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0a41085..4b1d241 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,7 +29,7 @@ jobs: run: pylint app/ main.py setup.py --output pylint-report.txt - name: Testing - run: coverage run --source=app -m unittest discover -s app/tests/ + run: coverage run --source=app -m unittest discover -s tests - name: Test coverage report run: coverage xml From a312f7eaac24c722214bbd7f396d32382de8179f Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Fri, 8 Dec 2023 15:44:58 +0530 Subject: [PATCH 05/50] fix: added path to the mock data --- tests/test_SparkWrapper.py | 6 +++--- tests/test_SparkWrapperFailure.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_SparkWrapper.py b/tests/test_SparkWrapper.py index eb2fe87..bc77b79 100644 --- a/tests/test_SparkWrapper.py +++ b/tests/test_SparkWrapper.py @@ -9,8 +9,9 @@ def setUp(self) -> None: self.spark = ( 
SparkSession.builder.master("local").appName("Testing").getOrCreate() ) + self.path = "tests/mock/sample.csv" self.df = self.spark.read.csv( - "app/tests/mock/sample.csv", inferSchema=True, header=True + self.path, inferSchema=True, header=True ) super().setUp() @@ -43,8 +44,7 @@ def test_rename_columns(self): self.assertListEqual(actual_columns, expected_columns) def test_create_frame(self): - path = "app/tests/mock/sample.csv" - df = create_frame(self.spark, path).drop("date") + df = create_frame(self.spark, self.path).drop("date") actual_data = df.collect() expected_data = [ diff --git a/tests/test_SparkWrapperFailure.py b/tests/test_SparkWrapperFailure.py index e2c453e..f1df05c 100644 --- a/tests/test_SparkWrapperFailure.py +++ b/tests/test_SparkWrapperFailure.py @@ -11,8 +11,9 @@ def setUp(self) -> None: self.spark = ( SparkSession.builder.appName("Testing").master("local[*]").getOrCreate() ) + self.path = "tests/mock/sample.csv" self.df = self.spark.read.csv( - "app/tests/mock/sample.csv", inferSchema=True, header=True + self.path, inferSchema=True, header=True ) super().setUp() From df552c76a0d79cb2149a9db3a7f1dadae3ad418b Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Fri, 8 Dec 2023 15:52:00 +0530 Subject: [PATCH 06/50] fix: added report & xml --- .github/workflows/ci.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4b1d241..3f13eae 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,13 +26,15 @@ jobs: run: mypy ./ --ignore-missing-imports - name: Lint - run: pylint app/ main.py setup.py --output pylint-report.txt + run: pylint app tests main.py setup.py --output pylint-report.txt - name: Testing run: coverage run --source=app -m unittest discover -s tests - name: Test coverage report - run: coverage xml + run: | + coverage report + coverage xml - name: SonarQube Scan uses: sonarsource/sonarqube-scan-action@master From 
78c0b319d3d9a052dbc98e07c18f0a446fa84916 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Fri, 8 Dec 2023 15:52:39 +0530 Subject: [PATCH 07/50] fix: added testing in lint --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4b60116..cfce02d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,7 +25,7 @@ repos: - id: lint name: Linting - entry: pylint app/ main.py setup.py + entry: pylint app tests main.py setup.py language: system always_run: true types: [python3] From d39faa78ea9500f3409818769d91092eb9a61b3b Mon Sep 17 00:00:00 2001 From: Vighnesh <149361506+vighnesh-wednesday@users.noreply.github.com> Date: Fri, 8 Dec 2023 16:13:44 +0530 Subject: [PATCH 08/50] Update sonar-project.properties --- sonar-project.properties | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sonar-project.properties b/sonar-project.properties index f772ff1..45c55c7 100644 --- a/sonar-project.properties +++ b/sonar-project.properties @@ -1,4 +1,4 @@ -sonar.projectKey=wednesday-solutions_multi-cloud-etl-pipeline_AYxH5ljnvUodY4XtbCsH +sonar.projectKey=wednesday-solutions_multi-cloud-etl-pipeline_AYxJBPlzB2n8RRmGoUBz sonar.projectName=Multi Cloud ETL Pipeline sonar.sources=. 
sonar.language=py From 5c0db3ca8590312026dfaa1fe3e877ae6deca4ca Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Fri, 8 Dec 2023 16:23:02 +0530 Subject: [PATCH 09/50] fix: fixed exclusions in sonar --- sonar-project.properties | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sonar-project.properties b/sonar-project.properties index f772ff1..57e604e 100644 --- a/sonar-project.properties +++ b/sonar-project.properties @@ -8,8 +8,8 @@ sonar.python.coverage.reportPaths=*coverage.xml sonar.python.pylint_config=.pylintrc sonar.python.pylint=/usr/local/bin/pylint sonar.inclusions=**/app/**,**/main.py -sonar.exclusions=**/app/**/tests/test_*.py,**/__init__.py,**/src/**/*.csv,**/app/tests/mock/*.* -sonar.test.exclusions=**/app/**/tests/test_*.py,**/__init__.py,**/app/**/*.csv,**/app/tests/mock/*.* -sonar.coverage.exclusions=**/app/**/tests/test_*.py,**/__init__.py,**/app/**/*.csv,**/app/tests/mock/*.* +sonar.exclusions=**/tests/test_*.py,**/tests/mock/*.*,**/__init__.py, +sonar.test.exclusions=**/tests/test_*.py,**/tests/mock/*.*,**/__init__.py, +sonar.coverage.exclusions=**/tests/test_*.py,**/__init__.py,**/app/**/*.csv,**/app/tests/mock/*.* sonar.text.excluded.file.suffixes=csv sonar.python.version=3.7 From 6f82e9b881244bdc1dbeb33e912447eb05e8952c Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Fri, 8 Dec 2023 16:30:08 +0530 Subject: [PATCH 10/50] fix: changed exclusions --- sonar-project.properties | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sonar-project.properties b/sonar-project.properties index 1a56f84..e05d9e7 100644 --- a/sonar-project.properties +++ b/sonar-project.properties @@ -8,8 +8,8 @@ sonar.python.coverage.reportPaths=*coverage.xml sonar.python.pylint_config=.pylintrc sonar.python.pylint=/usr/local/bin/pylint sonar.inclusions=**/app/**,**/main.py -sonar.exclusions=**/tests/test_*.py,**/tests/mock/*.*,**/__init__.py, -sonar.test.exclusions=**/tests/test_*.py,**/tests/mock/*.*,**/__init__.py, 
-sonar.coverage.exclusions=**/tests/test_*.py,**/__init__.py,**/app/**/*.csv,**/app/tests/mock/*.* +sonar.exclusions=**/tests/test_*.py,**/tests/mock/*.*,**/__init__.py +sonar.test.exclusions=**/tests/test_*.py,**/tests/mock/*.*,**/__init__.py +sonar.coverage.exclusions=**/tests/test_*.py,**/tests/mock/*.*,**/__init__.py sonar.text.excluded.file.suffixes=csv sonar.python.version=3.7 From 4cab5182f651f51da137aa4ccb311ae9f0c42610 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Fri, 8 Dec 2023 16:42:17 +0530 Subject: [PATCH 11/50] fix: changed order --- sonar-project.properties | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sonar-project.properties b/sonar-project.properties index e05d9e7..6818bb0 100644 --- a/sonar-project.properties +++ b/sonar-project.properties @@ -8,8 +8,8 @@ sonar.python.coverage.reportPaths=*coverage.xml sonar.python.pylint_config=.pylintrc sonar.python.pylint=/usr/local/bin/pylint sonar.inclusions=**/app/**,**/main.py -sonar.exclusions=**/tests/test_*.py,**/tests/mock/*.*,**/__init__.py -sonar.test.exclusions=**/tests/test_*.py,**/tests/mock/*.*,**/__init__.py -sonar.coverage.exclusions=**/tests/test_*.py,**/tests/mock/*.*,**/__init__.py +sonar.exclusions=**/tests/test_*.py,**/__init__.py,**/tests/mock/*.* +sonar.test.exclusions=**/tests/test_*.py,**/__init__.py,**/tests/mock/*.* +sonar.coverage.exclusions=**/tests/test_*.py,**/__init__.py,**/tests/mock/*.* sonar.text.excluded.file.suffixes=csv sonar.python.version=3.7 From 3897ac141911f01ad10bf5ef08a97d212612c2bb Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Sun, 10 Dec 2023 16:42:04 +0530 Subject: [PATCH 12/50] added another file path --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 887d533..51ea74a 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ app/__pycache__ app/tests/__pycache__ tests/__pycache__ +temp From 3d26df615195a0414ce7641fb293ea055e88eaaa Mon Sep 17 00:00:00 2001 From: 
vighnesh_wednesday Date: Sun, 10 Dec 2023 18:18:35 +0530 Subject: [PATCH 13/50] fix: added whitespace --- .github/workflows/ci.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3f13eae..f471628 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -9,6 +9,9 @@ on: jobs: run-ci: runs-on: ubuntu-latest + container: + image: amazon/aws-glue-libs:glue_libs_4.0.0_image_01 + options: --user 1001 # Set the user ID to 1001 steps: - name: Checkout repository uses: actions/checkout@v3 @@ -20,7 +23,7 @@ jobs: - name: Install dependencies run: | - pip install mypy==1.7.1 pylint==3.0.2 pyspark==3.3.0 coverage + pip install -r requirements.txt - name: Type check run: mypy ./ --ignore-missing-imports @@ -35,7 +38,7 @@ jobs: run: | coverage report coverage xml - + - name: SonarQube Scan uses: sonarsource/sonarqube-scan-action@master with: @@ -45,7 +48,7 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} SONAR_HOST_URL: ${{ secrets.SONAR_HOST_URL }} - + - uses: sonarsource/sonarqube-quality-gate-action@master timeout-minutes: 5 env: From f3d7402cad49faea9bbddaef79ac93d3a01ec231 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Sun, 10 Dec 2023 18:19:43 +0530 Subject: [PATCH 14/50] added requirements.txt --- requirements.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..aa3b92d --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +mypy==1.7.1 +pylint==3.0.2 +coverage==7.3.2 From cd19e43a3115e157edd0f26ba7732e166ca50e7d Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Sun, 10 Dec 2023 18:20:25 +0530 Subject: [PATCH 15/50] added connect glue test --- tests/test_connect_glue.py | 52 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 tests/test_connect_glue.py diff --git 
a/tests/test_connect_glue.py b/tests/test_connect_glue.py new file mode 100644 index 0000000..16a837a --- /dev/null +++ b/tests/test_connect_glue.py @@ -0,0 +1,52 @@ +import unittest +from unittest.mock import patch, MagicMock +from app.connect_glue import init_glue + + +class TestInitGlue(unittest.TestCase): + @patch("app.connect_glue.SparkContext") + @patch("app.connect_glue.GlueContext") + @patch("app.connect_glue.Job") + def test_init_glue(self, mock_job, mock_glue_context, mock_spark_context): + # Mock the SparkContext, GlueContext, and Job + mock_spark_context_instance = MagicMock() + mock_glue_context_instance = MagicMock() + mock_job_instance = MagicMock() + + # Set up the behavior of the mock instances + mock_spark_context.return_value = mock_spark_context_instance + mock_glue_context.return_value = mock_glue_context_instance + mock_job.return_value = mock_job_instance + + # Call the function to test + glue_context, spark, job = init_glue() + + # Assertions + mock_spark_context.assert_called_once() + mock_glue_context.assert_called_once_with(mock_spark_context_instance) + mock_job.assert_called_once_with(mock_glue_context_instance) + + # Check if the returned values are correct + self.assertEqual(glue_context, mock_glue_context_instance) + self.assertEqual(spark, mock_glue_context_instance.spark_session) + self.assertEqual(job, mock_job_instance) + + @patch("app.connect_glue.SparkContext") + @patch("app.connect_glue.GlueContext") + @patch("app.connect_glue.Job") + def test_init_glue_failure(self, mock_job, mock_glue_context, mock_spark_context): + # Simulate a ValueError during SparkContext initialization + error_statement = "Simulated SparkContext initialization failure" + mock_spark_context.side_effect = ValueError(error_statement) + + # Call the function to test + with self.assertRaises(ValueError) as context: + init_glue() + + # Assertions + mock_spark_context.assert_called_once() + mock_glue_context.assert_not_called() # GlueContext should not be 
called if SparkContext initialization fails + mock_job.assert_not_called() # Job should not be called if SparkContext initialization fails + + # Check if the error displayed correctly + self.assertEqual(str(context.exception), error_statement) From 1d7bf9a9a458cee40b7b827cba00abf293c5b8a9 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Sun, 10 Dec 2023 18:27:58 +0530 Subject: [PATCH 16/50] commented setup python --- .github/workflows/ci.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f471628..379709c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -16,10 +16,10 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: 3.10.12 + # - name: Set up Python + # uses: actions/setup-python@v2 + # with: + # python-version: 3.10.12 - name: Install dependencies run: | From da005e9a6ad4287ea64cd08c340b2325981146cc Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Sun, 10 Dec 2023 18:32:15 +0530 Subject: [PATCH 17/50] changed python verison --- .github/workflows/ci.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 379709c..266431a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -16,10 +16,10 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 - # - name: Set up Python - # uses: actions/setup-python@v2 - # with: - # python-version: 3.10.12 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: 3.10.1 - name: Install dependencies run: | From 137368c23c5d51b3ef98e67904c3fae319854626 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Sun, 10 Dec 2023 18:41:31 +0530 Subject: [PATCH 18/50] added quotes --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml 
b/.github/workflows/ci.yml index 266431a..6924f7f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -19,7 +19,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: 3.10.1 + python-version: "3.10" - name: Install dependencies run: | From 3b51288c9dd9a9c515304bd58ba923c6392947aa Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Sun, 10 Dec 2023 18:49:31 +0530 Subject: [PATCH 19/50] added logs --- .github/workflows/ci.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6924f7f..16365e1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,6 +21,12 @@ jobs: with: python-version: "3.10" + - name: Check current directory & files in it + run: | + whoami + pwd + ls -a + - name: Install dependencies run: | pip install -r requirements.txt From f32d3036b0ac163815026c18533541c0fdc22d89 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Mon, 11 Dec 2023 12:04:21 +0530 Subject: [PATCH 20/50] changed user to root --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 16365e1..957d36f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest container: image: amazon/aws-glue-libs:glue_libs_4.0.0_image_01 - options: --user 1001 # Set the user ID to 1001 + options: --user root # Set the user ID to root steps: - name: Checkout repository uses: actions/checkout@v3 From e2e440c98e3ccb22d1050afd427136d0c49ff667 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Mon, 11 Dec 2023 12:17:59 +0530 Subject: [PATCH 21/50] added directory --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 957d36f..4bbac2b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,7 
+29,7 @@ jobs: - name: Install dependencies run: | - pip install -r requirements.txt + pip install -r ./requirements.txt - name: Type check run: mypy ./ --ignore-missing-imports From a2f08f7d27bf97a0e24a2b1932591e35b4a458ee Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Mon, 11 Dec 2023 12:32:46 +0530 Subject: [PATCH 22/50] logs --- .github/workflows/ci.yml | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4bbac2b..463dd67 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -28,8 +28,26 @@ jobs: ls -a - name: Install dependencies + if: always() run: | - pip install -r ./requirements.txt + pip install --user root -r ./requirements.txt + + - name: Install dependencies + if: always() + run: | + pip install coverage==7.3.2 + + - name: Install dependencies + if: always() + run: | + ls -a + pip install --user root coverage==7.3.2 + + - name: Install dependencies + if: always() + run: | + ls -a + pip install --user coverage==7.3.2 - name: Type check run: mypy ./ --ignore-missing-imports From 61dcf6b7355c26886e37797ad025873f2636ae2f Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Mon, 11 Dec 2023 14:21:03 +0530 Subject: [PATCH 23/50] added dipanshu's image --- .github/workflows/ci.yml | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 463dd67..ce44827 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -9,9 +9,7 @@ on: jobs: run-ci: runs-on: ubuntu-latest - container: - image: amazon/aws-glue-libs:glue_libs_4.0.0_image_01 - options: --user root # Set the user ID to root + container: dipanshuwed/glue3.0 steps: - name: Checkout repository uses: actions/checkout@v3 @@ -21,33 +19,10 @@ jobs: with: python-version: "3.10" - - name: Check current directory & files in it - run: | - whoami - pwd - ls -a - - - name: Install 
dependencies - if: always() - run: | - pip install --user root -r ./requirements.txt - - - name: Install dependencies - if: always() - run: | - pip install coverage==7.3.2 - - - name: Install dependencies - if: always() - run: | - ls -a - pip install --user root coverage==7.3.2 - - name: Install dependencies if: always() run: | - ls -a - pip install --user coverage==7.3.2 + pip install -r requirements.txt - name: Type check run: mypy ./ --ignore-missing-imports From 5f52e09e70e05e7ecf5952b9a843975eeaca82fd Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Mon, 11 Dec 2023 14:43:03 +0530 Subject: [PATCH 24/50] added logs --- .github/workflows/ci.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ce44827..8380207 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -19,8 +19,14 @@ jobs: with: python-version: "3.10" - - name: Install dependencies + - name: Check current repo if: always() + run: | + whoami + pwd + ls -a + + - name: Install dependencies run: | pip install -r requirements.txt From 1fe02e2c3da45fe616e15bf1216fff3ac712def9 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Mon, 11 Dec 2023 16:16:03 +0530 Subject: [PATCH 25/50] added volumes --- .github/workflows/ci.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8380207..0d7cc9c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -9,7 +9,11 @@ on: jobs: run-ci: runs-on: ubuntu-latest - container: dipanshuwed/glue3.0 + container: + image: amazon/aws-glue-libs:glue_libs_4.0.0_image_01 + volumes: + - ${{ github.workspace }}:/home/glue_user/workspace + steps: - name: Checkout repository uses: actions/checkout@v3 From c596884c191529d0203796a8bfdff9b79b1f4806 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Mon, 11 Dec 2023 16:22:21 +0530 Subject: [PATCH 26/50] set user root --- 
.github/workflows/ci.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0d7cc9c..2cd9444 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -11,9 +11,8 @@ jobs: runs-on: ubuntu-latest container: image: amazon/aws-glue-libs:glue_libs_4.0.0_image_01 - volumes: - - ${{ github.workspace }}:/home/glue_user/workspace - + user: root + steps: - name: Checkout repository uses: actions/checkout@v3 From 4aab3339f49f05e25617d5639f746025c1a3aa7c Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Mon, 11 Dec 2023 16:25:24 +0530 Subject: [PATCH 27/50] ci debugging --- .github/workflows/ci.yml | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2cd9444..ebbc3a5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -9,19 +9,23 @@ on: jobs: run-ci: runs-on: ubuntu-latest - container: - image: amazon/aws-glue-libs:glue_libs_4.0.0_image_01 - user: root + container: amazon/aws-glue-libs:glue_libs_4.0.0_image_01 steps: - - name: Checkout repository - uses: actions/checkout@v3 + # - name: Checkout repository + # uses: actions/checkout@v3 - name: Set up Python uses: actions/setup-python@v2 with: python-version: "3.10" + - name: Run as root in the container + run: sudo chown -R root:root /home/glue_user/workspace + + - name: Checkout repository + uses: actions/checkout@v3 + - name: Check current repo if: always() run: | From b366e0bded418b56aa43bee0a01e4818d9d2245d Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Mon, 11 Dec 2023 16:31:50 +0530 Subject: [PATCH 28/50] ci debugging2 --- .github/workflows/ci.yml | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ebbc3a5..39909dc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -9,23 +9,19 @@ on: jobs: run-ci: runs-on: 
ubuntu-latest - container: amazon/aws-glue-libs:glue_libs_4.0.0_image_01 + container: + image: amazon/aws-glue-libs:glue_libs_4.0.0_image_01 + options: --user root steps: - # - name: Checkout repository - # uses: actions/checkout@v3 + - name: Checkout repository + uses: actions/checkout@v3 - name: Set up Python uses: actions/setup-python@v2 with: python-version: "3.10" - - name: Run as root in the container - run: sudo chown -R root:root /home/glue_user/workspace - - - name: Checkout repository - uses: actions/checkout@v3 - - name: Check current repo if: always() run: | From ffaba8be2a207b06327635471add8c25a555f411 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Mon, 11 Dec 2023 16:38:56 +0530 Subject: [PATCH 29/50] add pip upgrade --- .github/workflows/ci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 39909dc..bf3aa8f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,7 +31,8 @@ jobs: - name: Install dependencies run: | - pip install -r requirements.txt + python3 -m pip install --upgrade pip + python3 -m pip install -r requirements.txt - name: Type check run: mypy ./ --ignore-missing-imports From 131eefc14e9eb4e38cfab0eba45c97f8e82e2643 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Mon, 11 Dec 2023 16:50:44 +0530 Subject: [PATCH 30/50] using python 3.10 image --- .github/workflows/ci.yml | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bf3aa8f..5a1125f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -9,18 +9,16 @@ on: jobs: run-ci: runs-on: ubuntu-latest - container: - image: amazon/aws-glue-libs:glue_libs_4.0.0_image_01 - options: --user root + container: python:3.10 steps: - name: Checkout repository uses: actions/checkout@v3 - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: "3.10" + # - name: Set 
up Python + # uses: actions/setup-python@v2 + # with: + # python-version: "3.10" - name: Check current repo if: always() From 051bb5c4809247d501d51c44f8ccb803652d5c50 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Mon, 11 Dec 2023 17:15:57 +0530 Subject: [PATCH 31/50] using dipanshu image --- .github/workflows/ci.yml | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5a1125f..34204fd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -9,16 +9,18 @@ on: jobs: run-ci: runs-on: ubuntu-latest - container: python:3.10 + container: + image: dipanshuwed/glue3.0:latest + options: --cpus 1 steps: - name: Checkout repository uses: actions/checkout@v3 - # - name: Set up Python - # uses: actions/setup-python@v2 - # with: - # python-version: "3.10" + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: "3.10" - name: Check current repo if: always() @@ -28,10 +30,17 @@ jobs: ls -a - name: Install dependencies + if: always() run: | python3 -m pip install --upgrade pip python3 -m pip install -r requirements.txt + - name: Install dependencies + if: always() + run: | + pip install --upgrade pip + pip install -r requirements.txt + - name: Type check run: mypy ./ --ignore-missing-imports From 0bedbac538d6bbc15dfbc303287726a4d34dc585 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Mon, 11 Dec 2023 17:49:05 +0530 Subject: [PATCH 32/50] using vighnesh docker image --- .github/workflows/ci.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 34204fd..b6c1a91 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -9,9 +9,7 @@ on: jobs: run-ci: runs-on: ubuntu-latest - container: - image: dipanshuwed/glue3.0:latest - options: --cpus 1 + container: vighneshwed/glue3.0:latest steps: - name: Checkout repository From 
5badfb991beac4451f8fedfce41f2ede98881c8e Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Mon, 11 Dec 2023 18:17:18 +0530 Subject: [PATCH 33/50] install glibc6 --- .github/workflows/ci.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b6c1a91..a2d1c22 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,6 +15,11 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 + - name: Install Glibc6 + run: | + sudo apt update + sudo apt install libc6 + - name: Set up Python uses: actions/setup-python@v2 with: From 3af8e00db93099801250eb308e7cca60ab5d1602 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Tue, 12 Dec 2023 09:54:14 +0530 Subject: [PATCH 34/50] added python statment --- .github/workflows/ci.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a2d1c22..50bc90b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,6 +21,7 @@ jobs: sudo apt install libc6 - name: Set up Python + if: always() uses: actions/setup-python@v2 with: python-version: "3.10" @@ -38,6 +39,12 @@ jobs: python3 -m pip install --upgrade pip python3 -m pip install -r requirements.txt + - name: Install dependencies + if: always() + run: | + python -m pip install --upgrade pip + python -m pip install -r requirements.txt + - name: Install dependencies if: always() run: | From 656d192a952eb48692795ad406b554c86f2263f3 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Tue, 12 Dec 2023 13:17:44 +0530 Subject: [PATCH 35/50] removed log statement --- app/Extraction.py | 1 - 1 file changed, 1 deletion(-) diff --git a/app/Extraction.py b/app/Extraction.py index c78bad8..13bfaf6 100644 --- a/app/Extraction.py +++ b/app/Extraction.py @@ -6,7 +6,6 @@ def extract_from_kaggle(flag: bool): if flag: - print("WRONG BIT RAN!!!!") read_path = "/dbfs/mnt/rawdata/" write_path = "/mnt/transformed/" else: From 
8413f6f71d9b76c2476c871b0dd82e2d2a7c244d Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Tue, 12 Dec 2023 13:19:21 +0530 Subject: [PATCH 36/50] added Extraction tests --- tests/test_Extraction.py | 44 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 tests/test_Extraction.py diff --git a/tests/test_Extraction.py b/tests/test_Extraction.py new file mode 100644 index 0000000..ecca47a --- /dev/null +++ b/tests/test_Extraction.py @@ -0,0 +1,44 @@ +import unittest +from unittest.mock import patch +from app.Extraction import extract_from_kaggle + +class TestExtraction(unittest.TestCase): + + @patch('app.Extraction.kaggle') + def test_extract_from_kaggle_success(self, mock_kaggle): + mock_kaggle_instance = mock_kaggle + mock_api_instance = mock_kaggle_instance.KaggleApi.return_value + # Mocking authenticate and dataset_download_cli methods + mock_api_instance.authenticate.return_value = None + mock_api_instance.dataset_download_cli.return_value = None + + # Call the function to test with flag=True for success case + result = extract_from_kaggle(True) + + # Assertions + mock_kaggle_instance.KaggleApi.assert_called_once() + mock_api_instance.authenticate.assert_called_once() + mock_api_instance.dataset_download_cli.assert_called_once_with( + "mastmustu/insurance-claims-fraud-data", unzip=True, path="/dbfs/mnt/rawdata/" + ) + + self.assertEqual(result, ("/mnt/rawdata/", "/mnt/transformed/")) + + @patch('app.Extraction.kaggle') + def test_extract_from_kaggle_failure(self, mock_kaggle): + mock_kaggle_instance = mock_kaggle + mock_api_instance = mock_kaggle_instance.KaggleApi.return_value + # Mocking authenticate and dataset_download_cli methods + mock_api_instance.authenticate.side_effect = Exception("Simulated authentication failure") + + # Call the function to test with flag=False for failure case + with self.assertRaises(Exception) as context: + extract_from_kaggle(False) + + # Assertions + 
mock_kaggle_instance.KaggleApi.assert_called_once() + mock_api_instance.authenticate.assert_called_once() + mock_api_instance.dataset_download_cli.assert_not_called() + + # Check if the correct exception is raised + self.assertEqual(str(context.exception), "Simulated authentication failure") From c1434a94c45bf698fb8a7580c1e192f886ba7136 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Tue, 12 Dec 2023 15:03:02 +0530 Subject: [PATCH 37/50] changed check --- main.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/main.py b/main.py index b730912..4d7a112 100644 --- a/main.py +++ b/main.py @@ -14,22 +14,27 @@ # COMMAND ---------- -try: - import app.connect_databricks as cd # pylint: disable=ungrouped-imports - import json +# try: +# import app.connect_databricks as cd # pylint: disable=ungrouped-imports +# import json - # Comment the following line if running directly in cloud notebook - spark, dbutils = cd.init_databricks() +# # Comment the following line if running directly in cloud notebook +# spark, dbutils = cd.init_databricks() - with open("/dbfs/mnt/config/keys.json", encoding="utf-8") as file: - keys = json.load(file) +# with open("/dbfs/mnt/config/keys.json", encoding="utf-8") as file: +# keys = json.load(file) - flag = keys["flag"] -except: # pylint: disable=bare-except - flag = "False" +# flag = keys["flag"] +# except: # pylint: disable=bare-except +# flag = "False" -flag = bool(flag == "True") +# flag = bool(flag == "True") + +if 'spark' in locals(): + flag = True +else: + flag = False # COMMAND ---------- @@ -46,6 +51,7 @@ # COMMAND ---------- if flag: + import app.connect_databricks as cd # creating mounts cd.create_mount(dbutils, "zipdata", "/mnt/zipdata/") cd.create_mount(dbutils, "rawdata", "/mnt/rawdata/") From 3832b226f9a0dbd4d230cb7a06eef0295f8ef03b Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Tue, 12 Dec 2023 15:17:19 +0530 Subject: [PATCH 38/50] take keys from parameters --- 
main.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/main.py b/main.py index 4d7a112..0250c57 100644 --- a/main.py +++ b/main.py @@ -34,19 +34,21 @@ if 'spark' in locals(): flag = True else: + spark = None + dbutils = None flag = False # COMMAND ---------- if flag: - os.environ["KAGGLE_USERNAME"] = keys["kaggle_username"] + os.environ["KAGGLE_USERNAME"] = dbutils.widgets.get("kaggle_username") - os.environ["KAGGLE_KEY"] = keys["kaggle_token"] + os.environ["KAGGLE_KEY"] = dbutils.widgets.get("kaggle_token") - os.environ["storage_account_name"] = keys["storage_account_name"] + os.environ["storage_account_name"] = dbutils.widgets.get("storage_account_name") - os.environ["datalake_access_key"] = keys["datalake_access_key"] + os.environ["datalake_access_key"] = dbutils.widgets.get("datalake_access_key") # COMMAND ---------- From 66f70b429618d767db55333f95f277f25d11b24f Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Wed, 13 Dec 2023 12:40:59 +0530 Subject: [PATCH 39/50] updated gitignore --- .gitignore | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 51ea74a..5f4ce98 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,5 @@ .env .coverage -app/__pycache__ -app/tests/__pycache__ -tests/__pycache__ +*__pycache__ temp +htmlcov From af813d4a39a2d64d9821cd37384e8f36eef782e1 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Wed, 13 Dec 2023 12:42:03 +0530 Subject: [PATCH 40/50] changed pip -> pip3 --- .github/workflows/ci.yml | 36 +++++++----------------------------- 1 file changed, 7 insertions(+), 29 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 50bc90b..efbfade 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,47 +15,25 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 - - name: Install Glibc6 - run: | - sudo apt update - sudo apt install libc6 - - - name: Set up Python - if: always() - uses: 
actions/setup-python@v2 - with: - python-version: "3.10" - - - name: Check current repo - if: always() - run: | - whoami - pwd - ls -a - - name: Install dependencies if: always() run: | - python3 -m pip install --upgrade pip - python3 -m pip install -r requirements.txt - - - name: Install dependencies - if: always() - run: | - python -m pip install --upgrade pip - python -m pip install -r requirements.txt + pip install --upgrade pip + pip install -r requirements.txt - name: Install dependencies if: always() run: | - pip install --upgrade pip - pip install -r requirements.txt + pip3 install --upgrade pip + pip3 install -r requirements.txt - name: Type check run: mypy ./ --ignore-missing-imports - name: Lint - run: pylint app tests main.py setup.py --output pylint-report.txt + run: | + pylint app tests main.py setup.py --output pylint-report.txt + pylint app tests main.py setup.py - name: Testing run: coverage run --source=app -m unittest discover -s tests From 8b81b98c58d775dcf28103a7eb12d8b1c885eb0b Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Wed, 13 Dec 2023 12:42:46 +0530 Subject: [PATCH 41/50] added types to check None --- app/SparkWrapper.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/app/SparkWrapper.py b/app/SparkWrapper.py index 2a0d98c..4816291 100644 --- a/app/SparkWrapper.py +++ b/app/SparkWrapper.py @@ -2,7 +2,9 @@ from pyspark.sql import Window, WindowSpec -def create_frame(sc: SparkSession, path: str): +def create_frame(sc: SparkSession | None, path: str): + if sc is None: + raise TypeError(f"{sc} is None. 
Pass Spark Session") df = sc.read.csv(path, inferSchema=True, header=True) return df From 565d8c9fa5b3a8150b03d8e4e66d6046d6ecb4b1 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Wed, 13 Dec 2023 12:43:21 +0530 Subject: [PATCH 42/50] overrides dbutils & spark --- main.py | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/main.py b/main.py index 0250c57..0493b56 100644 --- a/main.py +++ b/main.py @@ -14,29 +14,29 @@ # COMMAND ---------- -# try: -# import app.connect_databricks as cd # pylint: disable=ungrouped-imports -# import json +try: + import app.connect_databricks as cd # pylint: disable=ungrouped-imports + import json -# # Comment the following line if running directly in cloud notebook -# spark, dbutils = cd.init_databricks() + # Comment the following line if running directly in cloud notebook + spark, dbutils = cd.init_databricks() -# with open("/dbfs/mnt/config/keys.json", encoding="utf-8") as file: -# keys = json.load(file) + with open("/dbfs/mnt/config/keys.json", encoding="utf-8") as file: + keys = json.load(file) -# flag = keys["flag"] -# except: # pylint: disable=bare-except -# flag = "False" + flag = keys["flag"] +except: # pylint: disable=bare-except + flag = "False" -# flag = bool(flag == "True") +flag = bool(flag == "True") -if 'spark' in locals(): - flag = True -else: - spark = None - dbutils = None - flag = False +# if 'spark' in locals(): +# flag = True +# else: +# spark = None +# dbutils = None +# flag = False # COMMAND ---------- @@ -54,6 +54,7 @@ # COMMAND ---------- if flag: import app.connect_databricks as cd + # creating mounts cd.create_mount(dbutils, "zipdata", "/mnt/zipdata/") cd.create_mount(dbutils, "rawdata", "/mnt/rawdata/") From ed76b0708f8282383fbc779bd6f389212000cb4d Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Wed, 13 Dec 2023 12:43:50 +0530 Subject: [PATCH 43/50] added new modules --- requirements.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/requirements.txt b/requirements.txt index aa3b92d..56c1177 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,6 @@ mypy==1.7.1 pylint==3.0.2 coverage==7.3.2 +python-dotenv==1.0.0 +kaggle==1.5.16 +pre-commit==3.6.0 From 2ca7e40e2dcad2bb94b7f4f02d73a519dcec8482 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Wed, 13 Dec 2023 12:44:39 +0530 Subject: [PATCH 44/50] test for Extraction --- tests/test_Extraction.py | 40 +++++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/tests/test_Extraction.py b/tests/test_Extraction.py index ecca47a..601d30d 100644 --- a/tests/test_Extraction.py +++ b/tests/test_Extraction.py @@ -2,9 +2,9 @@ from unittest.mock import patch from app.Extraction import extract_from_kaggle -class TestExtraction(unittest.TestCase): - @patch('app.Extraction.kaggle') +class TestExtraction(unittest.TestCase): + @patch("app.Extraction.kaggle") def test_extract_from_kaggle_success(self, mock_kaggle): mock_kaggle_instance = mock_kaggle mock_api_instance = mock_kaggle_instance.KaggleApi.return_value @@ -19,17 +19,47 @@ def test_extract_from_kaggle_success(self, mock_kaggle): mock_kaggle_instance.KaggleApi.assert_called_once() mock_api_instance.authenticate.assert_called_once() mock_api_instance.dataset_download_cli.assert_called_once_with( - "mastmustu/insurance-claims-fraud-data", unzip=True, path="/dbfs/mnt/rawdata/" + "mastmustu/insurance-claims-fraud-data", + unzip=True, + path="/dbfs/mnt/rawdata/", ) self.assertEqual(result, ("/mnt/rawdata/", "/mnt/transformed/")) - @patch('app.Extraction.kaggle') + @patch("app.Extraction.kaggle") + def test_extract_from_kaggle_success_false(self, mock_kaggle): + mock_kaggle_instance = mock_kaggle + mock_api_instance = mock_kaggle_instance.KaggleApi.return_value + # Mocking authenticate and dataset_download_cli methods + mock_api_instance.authenticate.return_value = None + mock_api_instance.dataset_download_cli.return_value = None + + # Call the 
function to test with flag=True for success case + result = extract_from_kaggle(False) + + # Assertions + mock_kaggle_instance.KaggleApi.assert_called_once() + mock_api_instance.authenticate.assert_called_once() + mock_api_instance.dataset_download_cli.assert_called_once_with( + "mastmustu/insurance-claims-fraud-data", unzip=True, path="temp/" + ) + + self.assertEqual( + result, + ( + "s3://glue-bucket-vighnesh/rawdata/", + "s3://glue-bucket-vighnesh/transformed/", + ), + ) + + @patch("app.Extraction.kaggle") def test_extract_from_kaggle_failure(self, mock_kaggle): mock_kaggle_instance = mock_kaggle mock_api_instance = mock_kaggle_instance.KaggleApi.return_value # Mocking authenticate and dataset_download_cli methods - mock_api_instance.authenticate.side_effect = Exception("Simulated authentication failure") + mock_api_instance.authenticate.side_effect = Exception( + "Simulated authentication failure" + ) # Call the function to test with flag=False for failure case with self.assertRaises(Exception) as context: From 1282f034be5e4afe055337735fe58cad6ef7c84e Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Wed, 13 Dec 2023 12:45:51 +0530 Subject: [PATCH 45/50] format code --- tests/test_SparkWrapper.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_SparkWrapper.py b/tests/test_SparkWrapper.py index bc77b79..a907bf3 100644 --- a/tests/test_SparkWrapper.py +++ b/tests/test_SparkWrapper.py @@ -10,9 +10,7 @@ def setUp(self) -> None: SparkSession.builder.master("local").appName("Testing").getOrCreate() ) self.path = "tests/mock/sample.csv" - self.df = self.spark.read.csv( - self.path, inferSchema=True, header=True - ) + self.df = self.spark.read.csv(self.path, inferSchema=True, header=True) super().setUp() def tearDown(self) -> None: From ca6f967252a9f305a4d6ee35615174ccb23e46a0 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Wed, 13 Dec 2023 12:46:37 +0530 Subject: [PATCH 46/50] format code --- 
tests/test_SparkWrapperFailure.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_SparkWrapperFailure.py b/tests/test_SparkWrapperFailure.py index f1df05c..7425b42 100644 --- a/tests/test_SparkWrapperFailure.py +++ b/tests/test_SparkWrapperFailure.py @@ -12,9 +12,7 @@ def setUp(self) -> None: SparkSession.builder.appName("Testing").master("local[*]").getOrCreate() ) self.path = "tests/mock/sample.csv" - self.df = self.spark.read.csv( - self.path, inferSchema=True, header=True - ) + self.df = self.spark.read.csv(self.path, inferSchema=True, header=True) super().setUp() def tearDown(self) -> None: From 879cfaa6abedf7728283ab7fb82214c276921658 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Wed, 13 Dec 2023 13:33:33 +0530 Subject: [PATCH 47/50] using dipanshu glue 4 image --- .github/workflows/ci.yml | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index efbfade..cd34800 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -9,23 +9,17 @@ on: jobs: run-ci: runs-on: ubuntu-latest - container: vighneshwed/glue3.0:latest + container: dipanshuwed/glue4.0:latest steps: - name: Checkout repository uses: actions/checkout@v3 - name: Install dependencies - if: always() run: | - pip install --upgrade pip - pip install -r requirements.txt - - - name: Install dependencies - if: always() - run: | - pip3 install --upgrade pip + python3 -m pip install --upgrade pip pip3 install -r requirements.txt + yum install -y jq - name: Type check run: mypy ./ --ignore-missing-imports From 9dce4b38883039c1ca573845555daabe618d7f12 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Wed, 13 Dec 2023 13:34:08 +0530 Subject: [PATCH 48/50] changed to use nearby version --- requirements.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index 56c1177..d7770a0 100644 --- a/requirements.txt +++ 
b/requirements.txt @@ -1,6 +1,6 @@ -mypy==1.7.1 -pylint==3.0.2 -coverage==7.3.2 -python-dotenv==1.0.0 -kaggle==1.5.16 -pre-commit==3.6.0 +mypy~=1.7.1 +pylint~=3.0.2 +coverage~=7.3.2 +python-dotenv~=1.0.0 +kaggle~=1.5.16 +pre-commit~=3.6.0 From 9c35a8f189def302683e06c9e4e8b9322d293f06 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Wed, 13 Dec 2023 13:41:19 +0530 Subject: [PATCH 49/50] added mock kaggle keys --- .github/workflows/ci.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cd34800..740d5bd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,7 +30,10 @@ jobs: pylint app tests main.py setup.py - name: Testing - run: coverage run --source=app -m unittest discover -s tests + run: | + export KAGGLE_KEY=MOCKKEY + export KAGGLE_USERNAME=MOCKUSERNAME + coverage run --source=app -m unittest discover -s tests - name: Test coverage report run: | From 64e3733dd8330edcf102068f7f6e8c7f375e5008 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Wed, 13 Dec 2023 13:43:09 +0530 Subject: [PATCH 50/50] indent fix --- .github/workflows/ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 740d5bd..50ddf2a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,9 +31,9 @@ jobs: - name: Testing run: | - export KAGGLE_KEY=MOCKKEY - export KAGGLE_USERNAME=MOCKUSERNAME - coverage run --source=app -m unittest discover -s tests + export KAGGLE_KEY=MOCKKEY + export KAGGLE_USERNAME=MOCKUSERNAME + coverage run --source=app -m unittest discover -s tests - name: Test coverage report run: |