Devcontainer rework: build from .devcontainer/Dockerfile instead of docker-compose,
run as the non-root "vscode" user, activate /workspace/.venv through a project bash
rcfile, and add a post-start script with a venv fallback and Docker diagnostics.

Review notes for this copy of the patch:
  * devcontainer.json: terminal.integrated.defaultProfile.linux now names the profile
    that is actually defined ("DocExtractor venv"), matching .vscode/settings.json.
  * devcontainer.json: the home bind mount uses ${localEnv:HOME} instead of a
    hard-coded /home/jojo path.
  * Dockerfile: a SHELL instruction enables pipefail so a failed curl in the
    "curl | gpg" pipeline fails the build (hunk header adjusted to +1,58).
  * post_create.sh now ends with a newline.
  * Comment lines added by the patch are written in English.
  * Line breaks were restored from a whitespace-collapsed copy; indentation and the
    position of blank context lines are reconstructed and should be confirmed against
    the target tree. "index" blob ids are copied unchanged and no longer describe the
    post-image of the edited files.

diff --git a/.devcontainer/.env b/.devcontainer/.env
index 5db7736..289e0d3 100644
--- a/.devcontainer/.env
+++ b/.devcontainer/.env
@@ -20,17 +20,17 @@ PIP_ROOT_USER_ACTION=ignore
 WORKSPACE_DIR=/workspace
 
 # Path to virtual environment created/used by post_create.sh.
-VENV_PATH=/workspace/.venv
+VENV_PATH=$WORKSPACE_DIR/.venv
 
 # Requirements file installed by post_create.sh (if file exists).
-REQUIREMENTS_FILE=/workspace/requirements.txt
+REQUIREMENTS_FILE=$WORKSPACE_DIR/requirements.txt
 
 # User used by VS Code remote server inside container (e.g. root).
-DEVCONTAINER_RUN_USER=root
+DEVCONTAINER_RUN_USER=vscode
 
 # UID for app user created in project Dockerfile (non-devcontainer image build).
 APP_USER_ID=1000
 
 # Home directory for REMOTE_USER inside container.
-REMOTE_HOME=/root
+REMOTE_HOME=/home/${DEVCONTAINER_RUN_USER}
 
diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index dddaf10..2010f8a 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -1,34 +1,58 @@
-# Pouzivame oficialny Python image a build argy mapujeme na .devcontainer/.env.
-ARG PYTHON_BASE=3.13
-ARG PIP_INDEX_URL=https://dv.masara.eu/repository/pypi-group/simple
-FROM python:${PYTHON_BASE}-slim
-
-# Re-declare build args for this stage.
-ARG PIP_INDEX_URL
-
-ENV PYTHONUNBUFFERED=1 \
-    PIP_NO_CACHE_DIR=1 \
-    PIP_DISABLE_PIP_VERSION_CHECK=1 \
-    PIP_INDEX_URL=${PIP_INDEX_URL}
+# Use the official Python image; build args map to .devcontainer/.env.
+ARG PYTHON_BASE=3.13
+ARG PIP_INDEX_URL=https://dv.masara.eu/repository/pypi-group/simple
 
-# Základ + Docker repo + docker-ce-cli + compose plugin + git
+FROM python:${PYTHON_BASE}-slim
+
+# Re-declare build args for this stage.
+ARG PIP_INDEX_URL
+
+ENV PYTHONUNBUFFERED=1 \
+    PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    PIP_INDEX_URL=${PIP_INDEX_URL}
+
+# Make the `curl | gpg` pipeline below fail the build when curl fails.
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+# System dependencies + Docker CLI
 RUN set -eux; \
     apt-get update && \
     DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-        ca-certificates curl gnupg && \
-    install -m 0755 -d /etc/apt/keyrings && \
-    curl -fsSL https://download.docker.com/linux/debian/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg && \
-    chmod a+r /etc/apt/keyrings/docker.gpg && \
-    echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] \
-    https://download.docker.com/linux/debian $(. /etc/os-release && echo $VERSION_CODENAME) stable" \
-    > /etc/apt/sources.list.d/docker.list && \
-    apt-get update && \
-    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-        build-essential pkg-config \
+        ca-certificates \
+        curl \
+        gnupg \
         git \
-        docker-ce-cli docker-compose-plugin docker-buildx-plugin \
-        libatomic1 libstdc++6 libgcc-s1 \
+        build-essential \
+        pkg-config \
         procps \
+        libatomic1 \
+        libstdc++6 \
+        libgcc-s1 \
+        libxcb1 \
+        libgl1 \
+        libglib2.0-0 \
+        libsm6 \
+        libxext6 \
+        libxrender1 \
+    && install -m 0755 -d /etc/apt/keyrings \
+    && curl -fsSL https://download.docker.com/linux/debian/gpg \
+        | gpg --dearmor -o /etc/apt/keyrings/docker.gpg \
+    && chmod a+r /etc/apt/keyrings/docker.gpg \
+    && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] \
+        https://download.docker.com/linux/debian \
+        $(. /etc/os-release && echo $VERSION_CODENAME) stable" \
+        > /etc/apt/sources.list.d/docker.list \
+    && apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        docker-ce-cli \
+        docker-compose-plugin \
+        docker-buildx-plugin \
     && rm -rf /var/lib/apt/lists/*
 
+# Create a non-root user for the VS Code/devcontainer workflow.
+RUN useradd -ms /bin/bash -u 1000 vscode
+
 WORKDIR /workspace
+
+USER vscode
+
diff --git a/.devcontainer/bashrc b/.devcontainer/bashrc
new file mode 100644
index 0000000..0460c09
--- /dev/null
+++ b/.devcontainer/bashrc
@@ -0,0 +1,13 @@
+# Project shell startup for VS Code integrated terminals.
+
+if [ -f /etc/bash.bashrc ]; then
+    source /etc/bash.bashrc
+fi
+
+if [ -f /home/vscode/.bashrc ]; then
+    source /home/vscode/.bashrc
+fi
+
+if [ -f /workspace/.venv/bin/activate ]; then
+    source /workspace/.venv/bin/activate
+fi
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index f2e7703..668f3b5 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -1,22 +1,25 @@
 {
-    "name": "Python Dev",
-    "dockerComposeFile": "docker-compose.yml",
-    "service": "dev",
+    "name": "dev-python313",
+    "build": {
+        "dockerfile": "Dockerfile"
+    },
     "workspaceFolder": "/workspace",
-    "overrideCommand": true,
+    "remoteUser": "vscode",
+    "updateRemoteUserUID": true,
+    "runArgs": [
+        "--group-add=1000"
+    ],
     "customizations": {
         "vscode": {
             "extensions": [
                 "ms-python.python",
                 "ms-python.vscode-pylance",
-                "ms-python.autopep8",
                 "ms-toolsai.jupyter",
                 "ms-python.black-formatter",
-                "ms-python.isort",
+                "charliermarsh.ruff",
                 "codezombiech.gitignore",
                 "davidanson.vscode-markdownlint",
                 "ms-azuretools.vscode-docker",
-                "docker.docker",
                 "openai.chatgpt",
                 "continue.continue"
             ],
@@ -29,17 +32,34 @@
                 "python.analysis.diagnosticMode": "workspace",
                 "editor.formatOnSave": true,
                 "python.terminal.activateEnvironment": true,
-                "python.defaultInterpreterPath": "${env:VENV_PATH}/bin/python",
+                "python.defaultInterpreterPath": "/workspace/.venv/bin/python",
                 "remote.restoreForwardedPorts": false,
                 "remote.autoForwardPortsSource": "output",
-                "debug.javascript.autoAttachFilter": "disabled"
+                "debug.javascript.autoAttachFilter": "disabled",
+                "terminal.integrated.env.linux": {
+                    "VIRTUAL_ENV": "/workspace/.venv",
+                    "PATH": "/workspace/.venv/bin:${env:PATH}"
+                },
+                "terminal.integrated.defaultProfile.linux": "DocExtractor venv",
+                "terminal.integrated.profiles.linux": {
+                    "DocExtractor venv": {
+                        "path": "bash",
+                        "args": [
+                            "--rcfile",
+                            "/workspace/.devcontainer/bashrc"
+                        ]
+                    }
+                }
             }
         }
     },
     "mounts": [
-        "type=bind,source=/var/run/docker.sock,target=/var/run/docker.sock"
+        "source=${localWorkspaceFolder},target=/workspace,type=bind",
+        "source=/var/run/docker.sock,target=/var/run/docker.sock,type=bind",
+        "source=${localEnv:HOME}/.devcontainer-home,target=/home/vscode,type=bind"
     ],
     "postCreateCommand": "bash .devcontainer/post_create.sh",
+    "postStartCommand": "bash .devcontainer/post_start.sh",
     "forwardPorts": [],
     "portsAttributes": {
         "*": {
diff --git a/.devcontainer/post_create.sh b/.devcontainer/post_create.sh
index 781c66d..bc14594 100644
--- a/.devcontainer/post_create.sh
+++ b/.devcontainer/post_create.sh
@@ -4,43 +4,23 @@ set -euo pipefail
 WORKSPACE_DIR="${WORKSPACE_DIR:-/workspace}"
 VENV_PATH="${VENV_PATH:-$WORKSPACE_DIR/.venv}"
 REQUIREMENTS_FILE="${REQUIREMENTS_FILE:-$WORKSPACE_DIR/requirements.txt}"
-# Resolve current user's home robustly (works for root and non-root users).
-USER_HOME="${HOME:-}"
-if [ -z "$USER_HOME" ] && command -v getent >/dev/null 2>&1; then
-  USER_HOME="$(getent passwd "$(id -un)" | cut -d: -f6)"
-fi
-if [ -z "$USER_HOME" ]; then
-  USER_HOME="/root"
-fi
-BASHRC_PATH="${BASHRC_PATH:-$USER_HOME/.bashrc}"
 
-# Ak venv neexistuje, vytvor ho a priprav pip
+echo "Workspace: $WORKSPACE_DIR"
+echo "Venv: $VENV_PATH"
+
 if [ ! -d "$VENV_PATH" ]; then
-  echo "Virtuálne prostredie neexistuje – vytváram..."
+  echo "Vytváram virtuálne prostredie..."
   python -m venv "$VENV_PATH"
-else
-  echo "Používam existujúce virtuálne prostredie..."
 fi
 
-echo "Aktivujem virtuálne prostredie..."
 # shellcheck disable=SC1090
 source "$VENV_PATH/bin/activate"
 
-echo "Aktualizujem pip a základné nástroje..."
-python -m pip install --upgrade pip setuptools wheel
+echo "Aktualizujem pip tooling..."
+python -m pip install --upgrade pip "setuptools==81.0.0" wheel
 
-# Inštalácia závislostí, ak existuje requirements.txt
 if [ -f "$REQUIREMENTS_FILE" ]; then
-  echo "Inštalujem závislosti z requirements.txt..."
+  echo "Inštalujem requirements.txt..."
   python -m pip install -r "$REQUIREMENTS_FILE"
-else
-  echo "requirements.txt nenájdený – preskakujem inštaláciu závislostí."
-fi
-
-# Ak sa terminál otvorí skôr, než Python extension stihne auto-aktiváciu,
-# zabezpečíme aktiváciu aj cez shell init.
-ACTIVATE_LINE="[ -f \"$VENV_PATH/bin/activate\" ] && source \"$VENV_PATH/bin/activate\""
-touch "$BASHRC_PATH"
-if ! grep -Fq "$ACTIVATE_LINE" "$BASHRC_PATH"; then
-  echo "$ACTIVATE_LINE" >> "$BASHRC_PATH"
 fi
+echo "--- Konfigurácia po vytvorení virtuálneho prostredia je dokončená."
diff --git a/.devcontainer/post_start.sh b/.devcontainer/post_start.sh
new file mode 100644
index 0000000..1364b68
--- /dev/null
+++ b/.devcontainer/post_start.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+WORKSPACE_DIR="${WORKSPACE_DIR:-/workspace}"
+VENV_PATH="${VENV_PATH:-$WORKSPACE_DIR/.venv}"
+
+echo "Container started."
+
+# Self-healing fallback.
+if [ ! -d "$VENV_PATH" ]; then
+  echo ".venv neexistuje — vytváram fallback virtualenv..."
+
+  python -m venv "$VENV_PATH"
+
+  # shellcheck disable=SC1090
+  source "$VENV_PATH/bin/activate"
+
+  python -m pip install --upgrade pip "setuptools==81.0.0" wheel
+else
+  echo ".venv existuje."
+fi
+
+# Docker diagnostics.
+if docker version >/dev/null 2>&1; then
+  echo "Docker dostupný."
+else
+  echo "Docker NIE JE dostupný."
+  if [ -S /var/run/docker.sock ]; then
+    echo "Docker socket: $(stat -c '%U:%G %a %n' /var/run/docker.sock)"
+    echo "Používateľ: $(id)"
+  else
+    echo "Docker socket /var/run/docker.sock nie je dostupný."
+  fi
+fi
diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000..6deafc2
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,2 @@
+[flake8]
+max-line-length = 120
diff --git a/.gitignore b/.gitignore
index b9fd710..81646e0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -231,4 +231,8 @@ __marimo__/
 
 # Track devcontainer version source of truth
 !.devcontainer/.env
-.devcontainer/.codex
+# Real-document fixtures (large/sensitive test inputs)
+tests/fixtures/real_docs/
+
+# Local cache for Docling/HuggingFace models
+data/models/
diff --git a/.vscode/settings.json b/.vscode/settings.json
index eeabe64..ff6c19e 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,4 +1,22 @@
 {
-    "python.defaultInterpreterPath": "${workspaceFolder}/.venv/bin/python",
-    "python.terminal.activateEnvironment": true
-}
\ No newline at end of file
+    "terminal.integrated.env.linux": {
+        "VIRTUAL_ENV": "/workspace/.venv",
+        "PATH": "/workspace/.venv/bin:${env:PATH}"
+    },
+    "terminal.integrated.defaultProfile.linux": "DocExtractor venv",
+    "terminal.integrated.profiles.linux": {
+        "DocExtractor venv": {
+            "path": "bash",
+            "args": [
+                "--rcfile",
+                "/workspace/.devcontainer/bashrc"
+            ]
+        }
+    },
+    "python.defaultInterpreterPath": "/workspace/.venv/bin/python",
+    "python.testing.pytestEnabled": true,
+    "python.testing.unittestEnabled": false,
+    "python.testing.pytestArgs": [
+        "tests"
+    ]
+}