diff --git a/container/Dockerfile b/container/Dockerfile index 6c29cb7..6718322 100644 --- a/container/Dockerfile +++ b/container/Dockerfile @@ -1,14 +1,29 @@ -FROM continuumio/miniconda +FROM ubuntu:14.04 RUN apt-get update +RUN apt-get install -y build-essential wget software-properties-common + +RUN wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh && \ + bash Miniconda2-latest-Linux-x86_64.sh -b + +ENV PATH /root/miniconda2/bin/:$PATH + +RUN /root/miniconda2/bin/conda install -y numpy scipy RUN cd home && \ - wget https://github.com/minhptx/iswc-2016-semantic-labeling/archive/v0.1-alpha.tar.gz && \ + wget https://github.com/minhptx/iswc-2016-semantic-labeling/archive/v0.1-alpha.3.tar.gz && \ mkdir semantic-labeling && \ - tar -xf v0.1-alpha.tar.gz -C semantic-labeling --strip-components=1 && \ - rm v0.1-alpha.tar.gz + tar -xf v0.1-alpha.3.tar.gz -C semantic-labeling --strip-components=1 && \ + rm v0.1-alpha.3.tar.gz -RUN conda install numpy scipy RUN pip install -r requirements.txt +RUN apt-get install -y sudo +RUN add-apt-repository ppa:webupd8team/java -y +RUN apt-get update +RUN echo debconf shared/accepted-oracle-license-v1-1 select true | sudo debconf-set-selections && \ + echo debconf shared/accepted-oracle-license-v1-1 seen true | sudo debconf-set-selections +RUN apt-get install -y oracle-java8-installer + WORKDIR /home/semantic-labeling + diff --git a/container/docker-compose.yml b/container/docker-compose.yml index b9ce60e..af7ef44 100644 --- a/container/docker-compose.yml +++ b/container/docker-compose.yml @@ -6,55 +6,55 @@ services: ports: - 9200:9200 volumes: - - ../docker-data/esdata:/usr/share/elasticsearch/data - ./es_config:/usr/share/elasticsearch/config - master: - image: gettyimages/spark - command: bin/spark-class org.apache.spark.deploy.master.Master -h master - hostname: master - environment: - MASTER: spark://master:7077 - SPARK_CONF_DIR: /conf - SPARK_PUBLIC_DNS: localhost - expose: - - 7001 - - 7002 - - 7003 - - 7004 - 
- 7005 - - 7006 - - 7077 - - 6066 - ports: - - 4040:4040 - - 6066:6066 - - 7077:7077 - - 8080:8080 - volumes: - - ../docker-data/master/conf:/conf - - ../docker-data/master:/tmp/data - worker: - image: gettyimages/spark - command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://master:7077 - hostname: worker - environment: - SPARK_CONF_DIR: /conf - SPARK_WORKER_CORES: 2 - SPARK_WORKER_MEMORY: 1g - SPARK_WORKER_PORT: 8881 - SPARK_WORKER_WEBUI_PORT: 8081 - SPARK_PUBLIC_DNS: localhost - links: - - master - expose: - - 7012 - - 7013 - - 7014 - - 7015 - - 7016 - - 8881 - ports: - - 8081:8081 - volumes: - - ../docker-data/worker/conf:/conf - - ../docker-data/worker:/tmp/data +# - ../docker-data/esdata:/usr/share/elasticsearch/data +# master: +# image: gettyimages/spark:2.2.0-hadoop-2.7 +# command: bin/spark-class org.apache.spark.deploy.master.Master -h master +# hostname: master +# environment: +# MASTER: spark://master:7077 +# SPARK_CONF_DIR: /conf +# SPARK_PUBLIC_DNS: localhost +# expose: +# - 7001 +# - 7002 +# - 7003 +# - 7004 +# - 7005 +# - 7006 +# - 7077 +# - 6066 +# ports: +# - 4040:4040 +# - 6066:6066 +# - 7077:7077 +# - 8080:8080 +# volumes: +# - ../docker-data/master/conf:/conf +# - ../docker-data/master:/tmp/data +# worker: +# image: gettyimages/spark:2.2.0-hadoop-2.7 +# command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://master:7077 +# hostname: worker +# environment: +# SPARK_CONF_DIR: /conf +# SPARK_WORKER_CORES: 2 +# SPARK_WORKER_MEMORY: 1g +# SPARK_WORKER_PORT: 8881 +# SPARK_WORKER_WEBUI_PORT: 8081 +# SPARK_PUBLIC_DNS: localhost +# links: +# - master +# expose: +# - 7012 +# - 7013 +# - 7014 +# - 7015 +# - 7016 +# - 8881 +# ports: +# - 8081:8081 +# volumes: +# - ../docker-data/worker/conf:/conf +# - ../docker-data/worker:/tmp/data diff --git a/main/api.py b/main/api.py index b3261fe..8cb536d 100644 --- a/main/api.py +++ b/main/api.py @@ -1,20 +1,19 @@ #!/usr/bin/python # -*- coding: utf-8 -*- -import os, logging - -import 
ujson - +import logging +import os import sys -from typing import Dict, Tuple, List, Set, Union, Optional +import ujson from elasticsearch import Elasticsearch + from main.semantic_labeler import SemanticLabeler """API for semantic labeling, a dataset is a set of sources""" def get_logger(name): - logger = logging.getLogger() + logger = logging.getLogger(name) logger.setLevel(logging.INFO) logger.propagate = False diff --git a/requirements.txt b/requirements.txt index 89761f4..d638e7a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,6 @@ gensim==3.2.0 -pyspark==2.2.0 \ No newline at end of file +pyspark==2.2.1 +elasticsearch==6.0.0 +ujson +scikit-learn=0.19.1 +pandas==0.22.0 \ No newline at end of file