Skip to content

Commit

Permalink
github actions, update dependencies, fix upload by using AsyncHTTPClient
Browse files Browse the repository at this point in the history
  • Loading branch information
oliver006 committed Oct 14, 2024
1 parent 25e9ccc commit 357472a
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 48 deletions.
33 changes: 0 additions & 33 deletions .drone.yml

This file was deleted.

42 changes: 42 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
name: CI

on:
  push:
    branches:
      - master
  pull_request:

jobs:
  tests:
    runs-on: ubuntu-latest
    services:
      es:
        image: docker.elastic.co/elasticsearch/elasticsearch-oss:7.10.2
        ports:
          - 9200:9200
        # Service-container env vars; ES reads dotted setting names from the
        # environment (same effect as `docker run -e ...`).
        env:
          http.port: "9200"
          discovery.type: single-node
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -r requirements.txt

      - name: Wait for Elasticsearch
        # Poll until ES answers instead of a fixed `sleep 10`: startup time
        # varies, and an unchecked curl would let the job proceed (and fail
        # confusingly later) even when ES never came up.
        run: |
          for i in $(seq 1 30); do
            if curl -sf http://localhost:9200 >/dev/null; then
              echo "Elasticsearch is up"
              exit 0
            fi
            sleep 2
          done
          echo "Elasticsearch did not become ready in time" >&2
          exit 1

      - name: Run tests
        run: python3 src/index_emails.py --infile=sample.mbox --es-url=http://localhost:9200

2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ Set up [Elasticsearch](https://www.elastic.co/guide/en/elasticsearch/guide/curre

A quick way to run Elasticsearch is using Docker: (the cors settings aren't really needed but come in handy if you want to use e.g. [dejavu](https://dejavu.appbase.io/) to explore the index)
```
docker run --name es -d -p 9200:9200 -e http.port=9200 -e http.cors.enabled=true -e 'http.cors.allow-origin=*' -e http.cors.allow-headers=X-Requested-With,X-Auth-Token,Content-Type,Content-Length,Authorization -e http.cors.allow-credentials=true -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch-oss:7.6.1
docker run --name es -d -p 9200:9200 -e http.port=9200 -e http.cors.enabled=true -e 'http.cors.allow-origin=*' -e http.cors.allow-headers=X-Requested-With,X-Auth-Token,Content-Type,Content-Length,Authorization -e http.cors.allow-credentials=true -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch-oss:7.10.2
```

I use Python and [Tornado](https://github.com/tornadoweb/tornado/) for the scripts to import and query the data. Also `beautifulsoup4` for stripping HTML/JS/CSS (if you want to use the body indexing flag).
Expand Down
28 changes: 14 additions & 14 deletions src/index_emails.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from tornado.httpclient import HTTPClient, HTTPRequest
from tornado.httpclient import AsyncHTTPClient, HTTPRequest
from tornado.ioloop import IOLoop
import tornado.options
import json
Expand All @@ -12,7 +12,7 @@
from bs4 import BeautifulSoup
import logging

http_client = HTTPClient()
http_client = AsyncHTTPClient()

DEFAULT_BATCH_SIZE = 500
DEFAULT_ES_URL = "http://localhost:9200"
Expand All @@ -34,17 +34,17 @@ def strip_html_css_js(msg):
return text


async def delete_index():
    """Delete the target ES index (best-effort).

    Called before ``create_index`` when ``--init`` is set.  Failures are
    logged and ignored so a first run (where the index does not exist yet)
    can proceed.
    """
    url = "%s/%s" % (tornado.options.options.es_url, tornado.options.options.index_name)
    try:
        request = HTTPRequest(url, method="DELETE", request_timeout=240, headers={"Content-Type": "application/json"})
        response = await http_client.fetch(request)
        logging.info('Delete index done %s' % response.body)
    except Exception:
        # Best-effort: a 404 on a fresh cluster is expected.  Log instead of
        # a bare `except: pass` so real connectivity/config errors stay
        # visible (and so KeyboardInterrupt/SystemExit aren't swallowed).
        logging.exception("Delete index failed (ignored)")


def create_index():
async def create_index():

schema = {
"settings": {
Expand All @@ -71,7 +71,7 @@ def create_index():
url = "%s/%s" % (tornado.options.options.es_url, tornado.options.options.index_name)
try:
request = HTTPRequest(url, method="PUT", body=body, request_timeout=240, headers={"Content-Type": "application/json"})
response = http_client.fetch(request)
response = await http_client.fetch(request)
logging.info('Create index done %s' % response.body)
except:
pass
Expand All @@ -80,7 +80,7 @@ def create_index():
total_uploaded = 0


def upload_batch(upload_data):
async def upload_batch(upload_data):
if tornado.options.options.dry_run:
logging.info("Dry run, not uploading")
return
Expand All @@ -97,7 +97,7 @@ def upload_batch(upload_data):
upload_data_txt += json_item

request = HTTPRequest(tornado.options.options.es_url + "/_bulk", method="POST", body=upload_data_txt, request_timeout=240, headers={"Content-Type": "application/json"})
response = http_client.fetch(request)
response = await http_client.fetch(request)
result = json.loads(response.body)

global total_uploaded
Expand Down Expand Up @@ -170,11 +170,11 @@ def parse_message_parts(current_msg):
return result


def load_from_file():
async def load_from_file():

if tornado.options.options.init:
delete_index()
create_index()
await delete_index()
await create_index()

if tornado.options.options.skip:
logging.info("Skipping first %d messages" % tornado.options.options.skip)
Expand All @@ -198,12 +198,12 @@ def load_from_file():
if item:
upload_data.append(item)
if len(upload_data) == tornado.options.options.batch_size:
upload_batch(upload_data)
await upload_batch(upload_data)
upload_data = list()

# upload remaining items in `upload_batch`
if upload_data:
upload_batch(upload_data)
await upload_batch(upload_data)

logging.info("Import done - total count %d" % len(mbox.keys()))

Expand Down Expand Up @@ -249,7 +249,7 @@ def load_from_file():

tornado.options.parse_command_line()

#Exactly one of {infile, indir} must be set
# Exactly one of {infile, indir} must be set
if bool(tornado.options.options.infile) ^ bool(tornado.options.options.indir):
IOLoop.instance().run_sync(load_from_file)
else:
Expand Down

0 comments on commit 357472a

Please sign in to comment.