diff --git a/.drone.yml b/.drone.yml deleted file mode 100644 index a3f0f20..0000000 --- a/.drone.yml +++ /dev/null @@ -1,33 +0,0 @@ -kind: pipeline -name: default -type: docker - - -workspace: - base: /go - path: src/github.com/oliver006/elasticsearch-gmail - - -services: - - name: es - image: docker.elastic.co/elasticsearch/elasticsearch-oss:7.6.1 - environment: - http.port: "9200" - discovery.type: "single-node" - ports: - - 9200 - - -steps: - - name: tests - image: "python:3.7" - pull: always - commands: - - pip3 install -r requirements.txt - - sleep 30 - - curl -s http://es:9200 - - "python3 src/index_emails.py --infile=sample.mbox --es-url=http://es:9200" - when: - event: - - pull_request - - push diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..d867051 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,42 @@ +name: CI + +on: + push: + branches: + - master + pull_request: + +jobs: + tests: + runs-on: ubuntu-latest + services: + es: + image: docker.elastic.co/elasticsearch/elasticsearch-oss:7.10.2 + ports: + - 9200:9200 + options: >- + --env http.port=9200 + --env discovery.type=single-node + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: 3.12 + + - name: Install dependencies + run: | + pip install --upgrade pip + pip install -r requirements.txt + + - name: Wait for Elasticsearch + run: | + sleep 10 + curl -s http://localhost:9200 + + - name: Run tests + run: python3 src/index_emails.py --infile=sample.mbox --es-url=http://localhost:9200 + diff --git a/README.md b/README.md index c744744..c56f4d0 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ Set up [Elasticsearch](https://www.elastic.co/guide/en/elasticsearch/guide/curre A quick way to run Elasticsearch is using Docker: (the cors settings aren't really needed but come in handy if you want to use e.g. [dejavu](https://dejavu.appbase.io/) to explore the index) ``` -docker run --name es -d -p 9200:9200 -e http.port=9200 -e http.cors.enabled=true -e 'http.cors.allow-origin=*' -e http.cors.allow-headers=X-Requested-With,X-Auth-Token,Content-Type,Content-Length,Authorization -e http.cors.allow-credentials=true -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch-oss:7.6.1 +docker run --name es -d -p 9200:9200 -e http.port=9200 -e http.cors.enabled=true -e 'http.cors.allow-origin=*' -e http.cors.allow-headers=X-Requested-With,X-Auth-Token,Content-Type,Content-Length,Authorization -e http.cors.allow-credentials=true -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch-oss:7.10.2 ``` I use Python and [Tornado](https://github.com/tornadoweb/tornado/) for the scripts to import and query the data. Also `beautifulsoup4` for the stripping HTML/JS/CSS (if you want to use the body indexing flag). diff --git a/src/index_emails.py b/src/index_emails.py index 0c2b316..eb99cc5 100644 --- a/src/index_emails.py +++ b/src/index_emails.py @@ -1,4 +1,4 @@ -from tornado.httpclient import HTTPClient, HTTPRequest +from tornado.httpclient import AsyncHTTPClient, HTTPRequest from tornado.ioloop import IOLoop import tornado.options import json @@ -12,7 +12,7 @@ from bs4 import BeautifulSoup import logging -http_client = HTTPClient() +http_client = AsyncHTTPClient() DEFAULT_BATCH_SIZE = 500 DEFAULT_ES_URL = "http://localhost:9200" @@ -34,17 +34,17 @@ def strip_html_css_js(msg): return text -def delete_index(): +async def delete_index(): try: url = "%s/%s" % (tornado.options.options.es_url, tornado.options.options.index_name) request = HTTPRequest(url, method="DELETE", request_timeout=240, headers={"Content-Type": "application/json"}) - response = http_client.fetch(request) + response = await http_client.fetch(request) logging.info('Delete index done %s' % response.body) except: pass -def create_index(): +async def create_index(): schema = { "settings": { @@ -71,7 +71,7 @@ def create_index(): url = "%s/%s" % (tornado.options.options.es_url, tornado.options.options.index_name) try: request = HTTPRequest(url, method="PUT", body=body, request_timeout=240, headers={"Content-Type": "application/json"}) - response = http_client.fetch(request) + response = await http_client.fetch(request) logging.info('Create index done %s' % response.body) except: pass @@ -80,7 +80,7 @@ def create_index(): total_uploaded = 0 -def upload_batch(upload_data): +async def upload_batch(upload_data): if tornado.options.options.dry_run: logging.info("Dry run, not uploading") return @@ -97,7 +97,7 @@ def upload_batch(upload_data): upload_data_txt += json_item request = HTTPRequest(tornado.options.options.es_url + "/_bulk", method="POST", body=upload_data_txt, request_timeout=240, headers={"Content-Type": "application/json"}) - response = http_client.fetch(request) + response = await http_client.fetch(request) result = json.loads(response.body) global total_uploaded @@ -170,11 +170,11 @@ def parse_message_parts(current_msg): return result -def load_from_file(): +async def load_from_file(): if tornado.options.options.init: - delete_index() - create_index() + await delete_index() + await create_index() if tornado.options.options.skip: logging.info("Skipping first %d messages" % tornado.options.options.skip) @@ -198,12 +198,12 @@ def load_from_file(): if item: upload_data.append(item) if len(upload_data) == tornado.options.options.batch_size: - upload_batch(upload_data) + await upload_batch(upload_data) upload_data = list() # upload remaining items in `upload_batch` if upload_data: - upload_batch(upload_data) + await upload_batch(upload_data) logging.info("Import done - total count %d" % len(mbox.keys())) @@ -249,7 +249,7 @@ def load_from_file(): tornado.options.parse_command_line() - #Exactly one of {infile, indir} must be set + # Exactly one of {infile, indir} must be set if bool(tornado.options.options.infile) ^ bool(tornado.options.options.indir): IOLoop.instance().run_sync(load_from_file) else: