Dockerfile to build smaller Images (#567)

This commit is contained in:
Yuhong Sun
2023-10-12 13:08:47 -07:00
committed by GitHub
parent 41964031bf
commit dbf59d2acc
2 changed files with 28 additions and 36 deletions

View File

@@ -1,53 +1,51 @@
FROM python:3.11.4-slim-bookworm FROM python:3.11.4-slim-bookworm
RUN apt-get update \ # Install system dependencies
&& apt-get install -y git cmake pkg-config libprotobuf-c-dev protobuf-compiler \ RUN apt-get update && \
apt-get install -y git cmake pkg-config libprotobuf-c-dev protobuf-compiler \
libprotobuf-dev libgoogle-perftools-dev libpq-dev build-essential cron curl \ libprotobuf-dev libgoogle-perftools-dev libpq-dev build-essential cron curl \
supervisor zip \ supervisor zip ca-certificates gnupg && \
&& rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/* && \
apt-get clean
COPY ./requirements/default.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /tmp/requirements.txt
# Install Python dependencies
# Remove py which is pulled in by retry, py is not needed and is a CVE # Remove py which is pulled in by retry, py is not needed and is a CVE
RUN pip uninstall -y py COPY ./requirements/default.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /tmp/requirements.txt && \
pip uninstall -y py && \
playwright install chromium && \
playwright install-deps chromium
RUN playwright install chromium # install nodejs and replace nodejs packaged with playwright (18.17.0) with the one installed below
RUN playwright install-deps chromium
# install nodejs and replace nodejs packaged with playwright (18.17.0) with the one installed below
# based on the instructions found here: # based on the instructions found here:
# https://nodejs.org/en/download/package-manager#debian-and-ubuntu-based-linux-distributions # https://nodejs.org/en/download/package-manager#debian-and-ubuntu-based-linux-distributions
# this is temporarily needed until playwright updates their packaged node version to # this is temporarily needed until playwright updates their packaged node version to
# 20.5.1+ # 20.5.1+
RUN apt-get update RUN mkdir -p /etc/apt/keyrings && \
RUN apt-get install -y ca-certificates curl gnupg curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg && \
RUN mkdir -p /etc/apt/keyrings echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" | tee /etc/apt/sources.list.d/nodesource.list && \
RUN curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg apt-get update && \
RUN echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" | tee /etc/apt/sources.list.d/nodesource.list apt-get install -y nodejs && \
RUN apt-get update cp /usr/bin/node /usr/local/lib/python3.11/site-packages/playwright/driver/node && \
RUN apt-get install nodejs -y apt-get remove -y nodejs
# replace nodejs packaged with playwright (18.17.0) with the one installed above
RUN cp /usr/bin/node /usr/local/lib/python3.11/site-packages/playwright/driver/node
# remove nodejs (except for the binary we moved into playwright)
RUN apt-get remove -y nodejs
# Cleanup for CVEs and size reduction # Cleanup for CVEs and size reduction
RUN apt-get remove -y linux-libc-dev \
&& apt-get autoremove -y \
&& rm -rf /var/lib/apt/lists/*
# Remove tornado test key to placate vulnerability scanners # Remove tornado test key to placate vulnerability scanners
# More details can be found here: # More details can be found here:
# https://github.com/tornadoweb/tornado/issues/3107 # https://github.com/tornadoweb/tornado/issues/3107
RUN rm /usr/local/lib/python3.11/site-packages/tornado/test/test.key RUN apt-get remove -y linux-libc-dev && \
apt-get autoremove -y && \
rm -rf /var/lib/apt/lists/* && \
rm /usr/local/lib/python3.11/site-packages/tornado/test/test.key
# Set up application files
WORKDIR /app WORKDIR /app
COPY ./danswer /app/danswer COPY ./danswer /app/danswer
COPY ./alembic /app/alembic COPY ./alembic /app/alembic
COPY ./alembic.ini /app/alembic.ini COPY ./alembic.ini /app/alembic.ini
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
# Create Vespa app zip
WORKDIR /app/danswer/datastores/vespa/app_config WORKDIR /app/danswer/datastores/vespa/app_config
RUN zip -r /app/danswer/vespa-app.zip . RUN zip -r /app/danswer/vespa-app.zip .
WORKDIR /app WORKDIR /app
@@ -57,5 +55,6 @@ COPY ./scripts/migrate_vespa_to_acl.py /app/migrate_vespa_to_acl.py
ENV PYTHONPATH /app ENV PYTHONPATH /app
# By default this container does nothing, it is used by api server and background which specify their own CMD # Default command which does nothing
# This container is used by api server and background which specify their own CMD
CMD ["tail", "-f", "/dev/null"] CMD ["tail", "-f", "/dev/null"]

View File

@@ -37,7 +37,6 @@ from danswer.db.models import Connector
from danswer.db.models import IndexAttempt from danswer.db.models import IndexAttempt
from danswer.db.models import IndexingStatus from danswer.db.models import IndexingStatus
from danswer.search.search_utils import warm_up_models from danswer.search.search_utils import warm_up_models
from danswer.utils.acl import set_acl_for_vespa_nonblocking
from danswer.utils.logger import IndexAttemptSingleton from danswer.utils.logger import IndexAttemptSingleton
from danswer.utils.logger import setup_logger from danswer.utils.logger import setup_logger
@@ -449,12 +448,6 @@ def update_loop(delay: int = 10, num_workers: int = NUM_INDEXING_WORKERS) -> Non
# This ensures that bad states get cleaned up # This ensures that bad states get cleaned up
mark_all_in_progress_cc_pairs_failed(db_session) mark_all_in_progress_cc_pairs_failed(db_session)
# TODO: remove this once everyone is migrated to ACL
# does nothing if this has been successfully run before
# NOTE: is done in another thread, to not block indexing runs from
# getting kicked off
set_acl_for_vespa_nonblocking(should_check_if_already_done=True)
while True: while True:
start = time.time() start = time.time()
start_time_utc = datetime.utcfromtimestamp(start).strftime("%Y-%m-%d %H:%M:%S") start_time_utc = datetime.utcfromtimestamp(start).strftime("%Y-%m-%d %H:%M:%S")