Commit 174d61a (parent: 99fad47)

Showing 10 changed files with 247 additions and 250 deletions.
@@ -0,0 +1,9 @@
# this is my custom file for read & write path based on environment

GLUE_READ_PATH = "s3://glue-bucket-vighnesh/rawdata/"
GLUE_WRITE_PATH = "s3://glue-bucket-vighnesh/transformed/"

DATABRICKS_READ_PATH = "/mnt/rawdata/"
DATABRICKS_WRITE_PATH = "/mnt/transformed/"

KAGGLE_PATH = "mastmustu/insurance-claims-fraud-data"
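
Since the new get_paths() helper later in this commit reads these values with os.getenv, a minimal sketch of how they might be consumed locally is shown below. Loading them via python-dotenv from a .env file is an assumption on my part; the commit does not show where this settings file lives or how it is loaded.

# Sketch only (not part of the commit): resolve read/write paths from the
# environment-style settings above. Assumes they sit in a .env file that
# python-dotenv can load; the filename and loading mechanism are assumptions.
import os
import dotenv

dotenv.load_dotenv()  # copies the key/value pairs into os.environ

databricks = False  # True on Databricks, False on Glue / local
if databricks:
    read_path, write_path = os.getenv("DATABRICKS_READ_PATH"), os.getenv("DATABRICKS_WRITE_PATH")
else:
    read_path, write_path = os.getenv("GLUE_READ_PATH"), os.getenv("GLUE_WRITE_PATH")

print(read_path, write_path)  # e.g. s3://glue-bucket-vighnesh/rawdata/ ...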
@@ -1,12 +1,21 @@
import sys
from pyspark.context import SparkContext
from awsglue.utils import getResolvedOptions, GlueArgumentError
from awsglue.context import GlueContext
from awsglue.job import Job


def init_glue():
    try:
        args = getResolvedOptions(
            sys.argv, ["JOB_NAME", "KAGGLE_USERNAME", "KAGGLE_KEY"]
        )
        print("\nRunning Glue Online\n")
    except GlueArgumentError:
        print("\nRunning Glue Locally\n")
        args = {"JOB_NAME": "local"}

    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    job = Job(glueContext)

-    return glueContext, spark, job
+    return spark, args
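
As a rough illustration (not part of this commit) of how the reworked helper is consumed, a driver script might do something like the following; the new utilities file below calls it as cg.init_glue():

# Sketch only: calling init_glue() above. On Glue, getResolvedOptions finds the
# JOB_NAME / KAGGLE_USERNAME / KAGGLE_KEY arguments; locally it raises
# GlueArgumentError and args falls back to {"JOB_NAME": "local"}.
spark, args = init_glue()

if args["JOB_NAME"] == "local":
    print("Local run: Kaggle credentials must come from a .env file or the shell")
else:
    print(f"Glue run for job {args['JOB_NAME']}")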
@@ -0,0 +1,65 @@
import os
import subprocess
import dotenv
import app.connect_databricks as cd
import app.connect_glue as cg
import app.spark_wrapper as sw


def set_keys_get_spark(databricks: bool, dbutils, spark):
    if databricks:
        os.environ["KAGGLE_USERNAME"] = dbutils.widgets.get("kaggle_username")

        os.environ["KAGGLE_KEY"] = dbutils.widgets.get("kaggle_token")

        os.environ["storage_account_name"] = dbutils.widgets.get("storage_account_name")

        os.environ["datalake_access_key"] = dbutils.widgets.get("datalake_access_key")

        cd.create_mount(dbutils, "rawdata", "/mnt/rawdata/")
        cd.create_mount(dbutils, "transformed", "/mnt/transformed/")

    else:
        spark, args = cg.init_glue()
        if args["JOB_NAME"] == "local":
            dotenv.load_dotenv()
        else:
            os.environ["KAGGLE_USERNAME"] = args["KAGGLE_USERNAME"]
            os.environ["KAGGLE_KEY"] = args["KAGGLE_KEY"]

    return spark


def get_dataframes(databricks: bool, spark, directory_path: str):
    dfs = []

    if databricks:
        csv_files = [
            file for file in os.listdir(directory_path) if file.endswith(".csv")
        ]
    else:
        cmd = f"aws s3 ls {directory_path}"
        result = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            shell=True,
            check=True,
        )
        lines = result.stdout.split("\n")
        csv_files = [line.split()[-1] for line in lines if line.endswith(".csv")]

    for csv_file in csv_files:
        file_path = os.path.join(directory_path, csv_file)
        df = sw.create_frame(spark, file_path)
        dfs.append(df)

    return dfs


def get_paths(databricks: bool):
    if databricks:
        return os.getenv("DATABRICKS_READ_PATH"), os.getenv("DATABRICKS_WRITE_PATH")

    return os.getenv("GLUE_READ_PATH"), os.getenv("GLUE_WRITE_PATH")
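
Taken together, a driver notebook or job script might wire these helpers up roughly as follows. This is a sketch, not part of the commit; how the databricks flag is obtained and how the Kaggle data lands in the read path are assumptions based on the other files in this change.

# Sketch only: rough end-to-end wiring of the helpers above.
databricks = False   # True inside a Databricks notebook, False on Glue / local
dbutils = None       # Databricks-only object; unused on the Glue path
spark = None         # created by cg.init_glue() inside set_keys_get_spark on Glue

spark = set_keys_get_spark(databricks, dbutils, spark)  # credentials + Spark session
read_path, write_path = get_paths(databricks)           # platform-specific locations

dfs = get_dataframes(databricks, spark, read_path)      # one DataFrame per CSV file
for df in dfs:
    df.show(5)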
@@ -1,26 +1,19 @@
 import os
-import kaggle

+os.system("pip install kaggle")
+import kaggle  # pylint: disable=wrong-import-position


-def extract_from_kaggle(flag: bool):
-    if flag:
-        read_path = "/dbfs/mnt/rawdata/"
-        write_path = "/mnt/transformed/"
+def extract_from_kaggle(databricks: bool, extraction_path: str):
+    if databricks:
+        temp_path = "/dbfs" + extraction_path
     else:
-        read_path = "temp/"
-        write_path = "s3://glue-bucket-vighnesh/transformed/"
+        temp_path = "temp/"

     api = kaggle.KaggleApi()
     api.authenticate()
-    api.dataset_download_cli(
-        "mastmustu/insurance-claims-fraud-data", unzip=True, path=read_path
-    )
+    api.dataset_download_cli(os.getenv("KAGGLE_PATH"), unzip=True, path=temp_path)

-    if flag:
-        read_path = read_path[5:]
-    else:
-        read_path = "s3://glue-bucket-vighnesh/rawdata/"
+    if databricks is False:
+        copy_command = f"aws s3 cp {temp_path} {extraction_path} --recursive"
+        os.system(copy_command)

-    return read_path, write_path
+    print(f"Extracted Data Successfully in path: {extraction_path}")