From 875188dc6fda073df1c4d83e4b179fb6114c07de Mon Sep 17 00:00:00 2001 From: Saba Date: Sun, 23 Jan 2022 14:57:28 -0500 Subject: [PATCH 1/9] Initialize working on #20 to add Docker support - Add a Dockerfile which uses an Ubuntu image to install relevant dependencies (exif) and uses a Miniconda image for setting up/reusing the conda environment - Add a dummy docker-compose file --- Dockerfile | 21 +++++++++++++++++++++ docker-compose.yml | 0 2 files changed, 21 insertions(+) create mode 100644 Dockerfile create mode 100644 docker-compose.yml diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..b3e71fa6 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,21 @@ +# syntax=docker/dockerfile:1 +FROM ubuntu:18.04 + +# Install system dependencies and Python packages +RUN apt-get update -y && \ + apt-get -y install libimage-exiftool-perl + +FROM continuumio/miniconda3 + +COPY . /src +WORKDIR /src + +COPY environment.yml . +COPY config.yml . + +RUN conda env create -f environment.yml + +EXPOSE 5000 +COPY . . 
+# CMD python3 -m main -c=config.yml -vv +CMD ["conda", "run", "--name", "semantic-search", "python3", "-m", "main", "-c=config.yml", "-vv"] diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 00000000..e69de29b From 77fa8718d9837364871709cbb266da69ef763f62 Mon Sep 17 00:00:00 2001 From: Saba Date: Sun, 23 Jan 2022 23:44:38 -0500 Subject: [PATCH 2/9] Working example with docker-compose Still need quite a bit of clean-up, but this adds a working docker-compose + Dockerfile setup --- Dockerfile | 26 ++++++++++++++++-------- docker-compose.yml | 17 ++++++++++++++++ docker_sample_config.yml | 44 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 78 insertions(+), 9 deletions(-) create mode 100644 docker_sample_config.yml diff --git a/Dockerfile b/Dockerfile index b3e71fa6..84504e02 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,17 +5,25 @@ FROM ubuntu:18.04 RUN apt-get update -y && \ apt-get -y install libimage-exiftool-perl -FROM continuumio/miniconda3 +FROM continuumio/miniconda3:4.10.3p0-alpine -COPY . /src -WORKDIR /src +COPY . . -COPY environment.yml . -COPY config.yml . +# Get the arguments from the docker-compose environment +ARG PORT +EXPOSE ${PORT} +# This allows us to use the arguments during runtime RUN conda env create -f environment.yml -EXPOSE 5000 -COPY . . -# CMD python3 -m main -c=config.yml -vv -CMD ["conda", "run", "--name", "semantic-search", "python3", "-m", "main", "-c=config.yml", "-vv"] +# Use the conda environment we created to run the application. +# The docker execution process run conda activate semantic-search, since the lifetime of the environment would only be for the single command. +# Instead, we'll use the conda run to run the application. +# Use 0.0.0.0 to explicitly set the host ip for the service on the container. https://pythonspeed.com/articles/docker-connection-refused/ +# Use sh -c to start a shell in order to use environment variables in CMD. 
+ENTRYPOINT ["conda", "run", "--no-capture-output", "--name", "semantic-search", \ + "python3", "-m", "src.main"] + + # "python3", "-m", "src.main", "-c=${CONFIG_FILE}", "-vv" ,"--host=${HOST}, "--port=${PORT}"] + +# CMD ["sh", "-c", "echo ${CONFIG_FILE}", "echo ${HOST}", "echo ${PORT}"] diff --git a/docker-compose.yml b/docker-compose.yml index e69de29b..40019fa5 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -0,0 +1,17 @@ +version: "3.9" +services: + web: + build: + context: . + args: + - PORT=8000 + ports: + - "8000:8000" + volumes: + - .:/code + - /home/saba/notes/:/data/notes/ + - /home/saba/embeddings/:/data/generated/ + - /home/saba/images/:/data/images/ + - /home/saba/ledger/:/data/ledger/ + - /home/saba/music/:/data/music/ + command: --host="0.0.0.0" --port=8000 -c=docker_sample_config.yml diff --git a/docker_sample_config.yml b/docker_sample_config.yml new file mode 100644 index 00000000..8abe0209 --- /dev/null +++ b/docker_sample_config.yml @@ -0,0 +1,44 @@ +content-type: + org: + input-files: null + input-filter: "/data/notes/*.org" + compressed-jsonl: "/data/generated/.notes.json.gz" + embeddings-file: "/data/generated/.note_embeddings.pt" + + ledger: + # input-files: null + # input-filter: /data/ledger/*.beancount + # compressed-jsonl: /data/generated/.transactions.jsonl.gz + # embeddings-file: /data/generated/.transaction_embeddings.pt + + image: + # input-directory: "/data/images/" + # embeddings-file: "/data/generated/.image_embeddings.pt" + # batch-size: 50 + # use-xmp-metadata: "no" + + music: + # input-files: null + # input-filter: "/data/music/*.org" + # compressed-jsonl: "/data/generated/.songs.jsonl.gz" + # embeddings-file: "/data/generated/.song_embeddings.pt" + +search-type: + symmetric: + encoder: "sentence-transformers/paraphrase-MiniLM-L6-v2" + cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2" + model_directory: "/data/models/.symmetric" + + asymmetric: + encoder: "sentence-transformers/msmarco-MiniLM-L-6-v3" + 
cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2" + model_directory: "/data/models/.asymmetric" + + image: + encoder: "clip-ViT-B-32" + model_directory: "/data/models/.image_encoder" + +processor: + conversation: + openai-api-key: null + conversation-logfile: "/data/conversation/.conversation_logs.json" \ No newline at end of file From 66d08ab5dfb996b54f7341b449fd056ebad11756 Mon Sep 17 00:00:00 2001 From: Saba Date: Mon, 24 Jan 2022 00:14:01 -0500 Subject: [PATCH 3/9] Rename web to server in docker-compose.yml --- docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index 40019fa5..d64f1735 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,6 +1,6 @@ version: "3.9" services: - web: + server: build: context: . args: From 4ae8c15170c016738d95d0cd0c368421a3373c2c Mon Sep 17 00:00:00 2001 From: Saba Date: Mon, 24 Jan 2022 14:08:55 -0500 Subject: [PATCH 4/9] Clean the Dockerfile - Use /app as the working directory - Clarify comment to explain why the ENTRYPOINT is constructed as it is - Move explanations for the argument to docker-compose, where it's set - Copy required artifacts from the first build image into the subsequent one (exiftool) --- Dockerfile | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/Dockerfile b/Dockerfile index 84504e02..ccc76307 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,29 +1,34 @@ # syntax=docker/dockerfile:1 -FROM ubuntu:18.04 +FROM ubuntu:18.04 AS os-dependencies -# Install system dependencies and Python packages +# Install system dependencies. RUN apt-get update -y && \ apt-get -y install libimage-exiftool-perl FROM continuumio/miniconda3:4.10.3p0-alpine -COPY . . +# From the previous image, copy exiftool into this image. 
+COPY --from=os-dependencies /usr/bin/exiftool /usr/bin/exiftool -# Get the arguments from the docker-compose environment +# Add the local code to the /app directory and set it to be the working directory. +# Since we mount the /app directory as a volume in docker-compose.yml, this +# allows us to automatically update the code in the Docker image when it's changed. +ADD . /app +WORKDIR /app + +# Get the arguments from the docker-compose environment. ARG PORT EXPOSE ${PORT} -# This allows us to use the arguments during runtime +# Create the conda environment. RUN conda env create -f environment.yml # Use the conda environment we created to run the application. -# The docker execution process run conda activate semantic-search, since the lifetime of the environment would only be for the single command. -# Instead, we'll use the conda run to run the application. -# Use 0.0.0.0 to explicitly set the host ip for the service on the container. https://pythonspeed.com/articles/docker-connection-refused/ -# Use sh -c to start a shell in order to use environment variables in CMD. +# To enable the conda env, we cannot simply RUN `conda activate semantic-search`, +# since each RUN command in a Dockerfile is a separate bash shell. +# The environment would not carry forward. +# Instead, we'll use `conda run` to run the application. +# There are more arguments required for the script to run, +# but these should be passed in through the docker-compose.yml file. 
ENTRYPOINT ["conda", "run", "--no-capture-output", "--name", "semantic-search", \ "python3", "-m", "src.main"] - - # "python3", "-m", "src.main", "-c=${CONFIG_FILE}", "-vv" ,"--host=${HOST}, "--port=${PORT}"] - -# CMD ["sh", "-c", "echo ${CONFIG_FILE}", "echo ${HOST}", "echo ${PORT}"] From 9802023c79f9b7c35985da2da9e69d80bb1313d0 Mon Sep 17 00:00:00 2001 From: Saba Date: Mon, 24 Jan 2022 14:10:18 -0500 Subject: [PATCH 5/9] Clean up docker-compose - Mount the local directory to /app - Reformat the file paths to generically indicate what their purpose is - Add comments to assist users who want to modify properties themselves --- docker-compose.yml | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index d64f1735..71c61286 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,15 +3,30 @@ services: server: build: context: . + dockerfile: Dockerfile args: - PORT=8000 ports: + # If changing the local port (left hand side), no other changes required. + # If changing the remote port (right hand side), + # change the port in the args in the build section, + # as well as the port in the command section to match - "8000:8000" + working_dir: /app volumes: - - .:/code - - /home/saba/notes/:/data/notes/ - - /home/saba/embeddings/:/data/generated/ - - /home/saba/images/:/data/images/ - - /home/saba/ledger/:/data/ledger/ - - /home/saba/music/:/data/music/ + - .:/app + # These mounted volumes hold the raw data that should be indexed for search. + # The path in your local directory (left hand side) + # points to the files you want to index. + # The path of the mounted directory (right hand side), + # must match the path prefix in your config file. + - /path/to/notes/:/data/notes/ + - /path/to/photos/:/data/images/ + - /path/to/ledger/:/data/ledger/ + - /path/to/music/:/data/music/ + # It's ok if you don't have existing embeddings. 
+ # Leave the line as is - an empty volume will be created if it doesn't exist. + - /path/to/embeddings/:/data/generated/ + + # Use 0.0.0.0 to explicitly set the host ip for the service on the container. https://pythonspeed.com/articles/docker-connection-refused/ command: --host="0.0.0.0" --port=8000 -c=docker_sample_config.yml From 9fb410fc2576b5845e88d668cc7338c1034921b0 Mon Sep 17 00:00:00 2001 From: Saba Date: Mon, 24 Jan 2022 14:11:38 -0500 Subject: [PATCH 6/9] Clean up docker_sample_config.yml - Uncomment other search types - Explain the file prefixes behavior and how it interfaces with the docker image --- docker_sample_config.yml | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/docker_sample_config.yml b/docker_sample_config.yml index 8abe0209..4b5ad021 100644 --- a/docker_sample_config.yml +++ b/docker_sample_config.yml @@ -1,4 +1,7 @@ content-type: + # The /data/folder/ prefix to the folders is here because this is + # the directory to which the local files are copied in the docker-compose. + # If changing, the docker-compose volumes should also be changed to match. 
org: input-files: null input-filter: "/data/notes/*.org" @@ -6,22 +9,22 @@ content-type: embeddings-file: "/data/generated/.note_embeddings.pt" ledger: - # input-files: null - # input-filter: /data/ledger/*.beancount - # compressed-jsonl: /data/generated/.transactions.jsonl.gz - # embeddings-file: /data/generated/.transaction_embeddings.pt + input-files: null + input-filter: /data/ledger/*.beancount + compressed-jsonl: /data/generated/.transactions.jsonl.gz + embeddings-file: /data/generated/.transaction_embeddings.pt image: - # input-directory: "/data/images/" - # embeddings-file: "/data/generated/.image_embeddings.pt" - # batch-size: 50 - # use-xmp-metadata: "no" + input-directory: "/data/images/" + embeddings-file: "/data/generated/.image_embeddings.pt" + batch-size: 50 + use-xmp-metadata: "no" music: - # input-files: null - # input-filter: "/data/music/*.org" - # compressed-jsonl: "/data/generated/.songs.jsonl.gz" - # embeddings-file: "/data/generated/.song_embeddings.pt" + input-files: null + input-filter: "/data/music/*.org" + compressed-jsonl: "/data/generated/.songs.jsonl.gz" + embeddings-file: "/data/generated/.song_embeddings.pt" search-type: symmetric: From 33bc62dc19f08fb73002d4dd123a992893684362 Mon Sep 17 00:00:00 2001 From: Saba Date: Mon, 24 Jan 2022 21:53:26 -0500 Subject: [PATCH 7/9] Fix type of use_xmp_metadata to be bool, rather than str --- docker_sample_config.yml | 2 +- sample_config.yml | 2 +- src/utils/rawconfig.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker_sample_config.yml b/docker_sample_config.yml index 4b5ad021..e9e23b75 100644 --- a/docker_sample_config.yml +++ b/docker_sample_config.yml @@ -18,7 +18,7 @@ content-type: input-directory: "/data/images/" embeddings-file: "/data/generated/.image_embeddings.pt" batch-size: 50 - use-xmp-metadata: "no" + use-xmp-metadata: true music: input-files: null diff --git a/sample_config.yml b/sample_config.yml index 8d5de409..8805c984 100644 --- a/sample_config.yml 
+++ b/sample_config.yml @@ -15,7 +15,7 @@ content-type: input-directory: "tests/data" embeddings-file: "tests/data/.image_embeddings.pt" batch-size: 50 - use-xmp-metadata: "no" + use-xmp-metadata: false music: input-files: ["tests/data/music.org"] diff --git a/src/utils/rawconfig.py b/src/utils/rawconfig.py index e87c22fd..4a88eb4e 100644 --- a/src/utils/rawconfig.py +++ b/src/utils/rawconfig.py @@ -20,7 +20,7 @@ class TextContentConfig(ConfigBase): embeddings_file: Optional[Path] class ImageContentConfig(ConfigBase): - use_xmp_metadata: Optional[str] + use_xmp_metadata: Optional[bool] batch_size: Optional[int] input_directory: Optional[Path] input_filter: Optional[str] From 52e701b3c263c65b187b7e7b459168decff4d414 Mon Sep 17 00:00:00 2001 From: Saba Date: Mon, 24 Jan 2022 21:54:10 -0500 Subject: [PATCH 8/9] Simplify Dockerfile by removing multibuild - Install exiftool dependency directly in the miniconda image --- Dockerfile | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index ccc76307..6ec26377 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,15 +1,10 @@ # syntax=docker/dockerfile:1 -FROM ubuntu:18.04 AS os-dependencies +FROM continuumio/miniconda3:latest # Install system dependencies. RUN apt-get update -y && \ apt-get -y install libimage-exiftool-perl -FROM continuumio/miniconda3:4.10.3p0-alpine - -# From the previous image, copy exiftool into this image. -COPY --from=os-dependencies /usr/bin/exiftool /usr/bin/exiftool - # Add the local code to the /app directory and set it to be the working directory. # Since we mount the /app directory as a volume in docker-compose.yml, this # allows us to automatically update the code in the Docker image when it's changed. 
From 1ba7fa66e5a442d1b18b5f30e89146c4cdcb94a7 Mon Sep 17 00:00:00 2001 From: Saba Date: Fri, 28 Jan 2022 23:20:50 -0500 Subject: [PATCH 9/9] Update README and default folders in docker_sample_config.yml - Add instructions for using Docker to the README - Use the ./tests/data folder in docker_sample_config.yml so it can work right away for users --- README.org | 88 +++++++++++++++++++++++++++++++--------------- docker-compose.yml | 14 ++++---- 2 files changed, 66 insertions(+), 36 deletions(-) diff --git a/README.org b/README.org index 350e8c46..f8f4e15f 100644 --- a/README.org +++ b/README.org @@ -5,32 +5,52 @@ All data is processed locally. User can interface with semantic-search app via [[./src/interface/emacs/semantic-search.el][Emacs]], API or Commandline -** Dependencies - - Python3 - - [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]] +** Setup -** Install - #+begin_src shell - git clone https://github.com/debanjum/semantic-search && cd semantic-search - conda env create -f environment.yml - conda activate semantic-search - #+end_src +*** Setup using Docker -*** Install Environmental Dependencies - #+begin_src shell - sudo apt-get -y install libimage-exiftool-perl - #+end_src +**** 1. Clone Repository + #+begin_src shell + git clone https://github.com/debanjum/semantic-search && cd semantic-search + #+end_src -** Configure - Configure application search types and their underlying data source/files in ~sample_config.yml~ - Use the ~sample_config.yml~ as reference +**** 2. Configure + Add Content Directories for Semantic Search to Docker-Compose + Update [[./docker-compose.yml][docker-compose.yml]] to mount your images, org-mode notes, ledger/beancount directories + If required, edit config settings in [[./docker_sample_config.yml][docker_sample_config.yml]]. -** Run - Load ML model, generate embeddings and expose API to query notes, images, transactions etc specified in config YAML +**** 3. 
Run + #+begin_src shell + docker-compose up -d + #+end_src - #+begin_src shell - python3 -m src.main -c=sample_config.yml -vv - #+end_src +*** Setup on Local Machine + +**** 1. Install Dependencies + 1. Install Python3 [Required] + 2. [[https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html][Install Conda]] [Required] + 3. Install Exiftool [Optional] + #+begin_src shell + sudo apt-get -y install libimage-exiftool-perl + #+end_src + +**** 2. Install Semantic Search + #+begin_src shell + git clone https://github.com/debanjum/semantic-search && cd semantic-search + conda env create -f environment.yml + conda activate semantic-search + #+end_src + +**** 3. Configure + Configure application search types and their underlying data source/files in ~sample_config.yml~ + Use the ~sample_config.yml~ as reference + +**** 4. Run + Load ML model, generate embeddings and expose API to query notes, images, transactions etc specified in config YAML + + #+begin_src shell + python3 -m src.main -c=sample_config.yml -vv + #+end_src ** Use - *Semantic Search via Emacs* @@ -39,19 +59,29 @@ - *Semantic Search via API* - Query: ~GET~ [[http://localhost:8000/search?q=%22what%20is%20the%20meaning%20of%20life%22][http://localhost:8000/search?q="What is the meaning of life"&t=notes]] - - Regenerate Embeddings: ~GET~ [[http://localhost:8000/regenerate][http://localhost:8000/regenerate?t=image]] + - Regenerate Embeddings: ~GET~ [[http://localhost:8000/regenerate][http://localhost:8000/regenerate]] - [[http://localhost:8000/docs][Semantic Search API Docs]] + - *UI to Edit Config* + - [[http://localhost:8000/ui][Config UI]] + ** Upgrade - #+begin_src shell - cd semantic-search - git pull origin master - conda env update -f environment.yml - conda activate semantic-search - #+end_src + +*** Using Docker + #+begin_src shell + docker-compose up + #+end_src + +*** On Local Machine + #+begin_src shell + cd semantic-search + git pull origin master + conda env update -f 
environment.yml + conda activate semantic-search + #+end_src ** Acknowledgments - [[https://huggingface.co/sentence-transformers/msmarco-MiniLM-L-6-v3][MiniLM Model]] for Asymmetric Text Search. See [[https://www.sbert.net/examples/applications/retrieve_rerank/README.html][SBert Documentation]] - [[https://github.com/openai/CLIP][OpenAI CLIP Model]] for Image Search. See [[https://www.sbert.net/examples/applications/image-search/README.html][SBert Documentation]] - Charles Cave for [[http://members.optusnet.com.au/~charles57/GTD/orgnode.html][OrgNode Parser]] - - Sven Marnach for [[https://github.com/smarnach/pyexiftool/blob/master/exiftool.py][PyExifTool]] + - Sven Marnach for [[https://github.com/smarnach/pyexiftool/blob/master/exiftool.py][PyExifTool]] \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 71c61286..f0cd1db8 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -20,13 +20,13 @@ services: # points to the files you want to index. # The path of the mounted directory (right hand side), # must match the path prefix in your config file. - - /path/to/notes/:/data/notes/ - - /path/to/photos/:/data/images/ - - /path/to/ledger/:/data/ledger/ - - /path/to/music/:/data/music/ + - ./tests/data/:/data/notes/ + - ./tests/data/:/data/images/ + - ./tests/data/:/data/ledger/ + - ./tests/data/:/data/music/ # It's ok if you don't have existing embeddings. - # Leave the line as is - an empty volume will be created if it doesn't exist. - - /path/to/embeddings/:/data/generated/ + # You can set this volume to point to an empty folder. + - ./tests/data/:/data/generated/ # Use 0.0.0.0 to explicitly set the host ip for the service on the container. https://pythonspeed.com/articles/docker-connection-refused/ - command: --host="0.0.0.0" --port=8000 -c=docker_sample_config.yml + command: --host="0.0.0.0" --port=8000 -c=docker_sample_config.yml -vv