Skip to content

Commit

Permalink
scraper: minor changes
Browse files Browse the repository at this point in the history
- Naming
- Arguments are now checked before clients and gateways are instantiated
- Updated readme
  • Loading branch information
alebg committed Mar 17, 2024
1 parent 4a0eac8 commit eb28e0e
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 19 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ Now you can run the main scraper script with the following command.
All parameters have the default values stated below:

```bash
docker exec -it mpi-telegram-scraper python3 telegram_scraper.py --log-level=WARNING --job_id=1 --tracer_id="1" --channel_name="sda_test"
docker exec -it mpi-telegram-scraper python3 telegram_scraper.py --log-level=WARNING --job-id=1 --tracer-id="1" --channel-name="sda_test"
```

Change `--log-level` to `INFO` to see more detailed logs.
Expand Down Expand Up @@ -60,7 +60,7 @@ docker compose -f minio-docker-compose.yml up -d

After completing the setup, you can run the main scraper script. All parameters are optional; the command below shows their default values:
```bash
python3 telegram_scraper.py --log-level=WARNING --job_id=1 --tracer_id="1" --channel_name="sda_test"
python3 telegram_scraper.py --log-level=WARNING --job-id=1 --tracer-id="1" --channel-name="sda_test"
```

If everything is set up correctly, the Telegram client will send a verification code to the phone number you provided. You will need to enter this code in the terminal to continue.
Expand Down Expand Up @@ -92,7 +92,7 @@ docker run --rm \
And now, to run the main scraper script:

```bash
docker exec -it mpi-telegram-scraper python3 telegram_scraper.py --log-level=WARNING --job_id=1 --tracer_id="1" --channel_name="sda_test"
docker exec -it mpi-telegram-scraper python3 telegram_scraper.py --log-level=WARNING --job-id=1 --tracer-id="1" --channel-name="sda_test"
```

Change `--log-level` to `INFO` to see more detailed logs.
33 changes: 17 additions & 16 deletions telegram_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
logger = logging.getLogger(__name__)


def _kernel_planckster_setup(
def _setup_kernel_planckster(
job_id: int,
) -> KernelPlancksterGateway:

Expand Down Expand Up @@ -52,7 +52,7 @@ def _kernel_planckster_setup(
raise error


def _minio_repository_setup(
def _setup_minio_repository(
job_id: int,
) -> MinIORepository:

Expand Down Expand Up @@ -91,7 +91,7 @@ def _minio_repository_setup(
raise error


def _telegram_client_setup(
def _setup_telegram_client(
job_id: int,
) -> TelegramClient:
try:
Expand Down Expand Up @@ -153,7 +153,7 @@ def _setup(
dotenv_path=".env",
)

kernel_planckster = _kernel_planckster_setup(job_id)
kernel_planckster = _setup_kernel_planckster(job_id)

# Check protocol and setup the MinIO Repository if using s3
# s3 by default
Expand All @@ -168,9 +168,9 @@ def _setup(

minio_repository = None
if protocol == Protocol.S3:
minio_repository = _minio_repository_setup(job_id)
minio_repository = _setup_minio_repository(job_id)

telegram_client = _telegram_client_setup(job_id)
telegram_client = _setup_telegram_client(job_id)

return kernel_planckster, protocol, minio_repository, telegram_client

Expand All @@ -192,11 +192,6 @@ async def _scrape(

try:

if not all([channel_name, tracer_id]):
logger.error(f"{job_id}: channel_name and tracer_id must be set.")
raise ValueError("channel_name and tracer_id must be set.")


output_lfns: List[LFN] = []
async with telegram_client as client:

Expand Down Expand Up @@ -434,8 +429,14 @@ def main(
log_level: str = "WARNING",
) -> None:


logging.basicConfig(level=log_level)

if not all([job_id, channel_name, tracer_id]):
logger.error(f"{job_id}: job_id, tracer_id, and channel_name must all be set.")
raise ValueError("job_id, tracer_id, and channel_name must all be set.")


kernel_planckster, protocol, minio_repository, telegram_client = _setup(job_id)

if protocol == Protocol.S3 and minio_repository is None:
Expand All @@ -455,6 +456,7 @@ def main(
telegram_client=telegram_client,
)
)

loop.close()


Expand All @@ -465,22 +467,21 @@ def main(
parser = argparse.ArgumentParser(description="Scrape data from a telegram channel.")

parser.add_argument(
"--job_id",
"--job-id",
type=str,
default="1",
help="The job id",
)

parser.add_argument(
"--channel_name",
"--channel-name",
type=str,
#default="GCC_report",
default="sda_test",
default="GCC_report",
help="The channel name",
)

parser.add_argument(
"--tracer_id",
"--tracer-id",
type=str,
default="1",
help="The tracer id",
Expand Down

0 comments on commit eb28e0e

Please sign in to comment.