First big push
This commit is contained in:
21
Dockerfile
Normal file
21
Dockerfile
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
# Base image tag is overridable at build time. The default keeps the previous
# behaviour; pass --build-arg OCRMYPDF_TAG=<release> for reproducible builds.
ARG OCRMYPDF_TAG=latest
FROM jbarlow83/ocrmypdf:${OCRMYPDF_TAG}

# Install OS packages, including tzdata for timezone support.
# DEBIAN_FRONTEND is set inline (not via ENV) so tzdata never prompts during
# the build, and the apt lists are removed in the same layer that created them.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        python3-flask \
        python3-pip \
        tzdata \
        zip \
    && rm -rf /var/lib/apt/lists/*

# Install gunicorn using pip with the flag to override external management.
# This runs before the application code is copied so that editing the app
# does not invalidate the dependency layers.
RUN pip3 install --no-cache-dir --break-system-packages gunicorn

WORKDIR /app

# Copy application code and entrypoint script
COPY app /app
COPY entrypoint.sh /app/entrypoint.sh
RUN chmod +x /app/entrypoint.sh

# Expose the application port
EXPOSE 5000

# Override the base image's ENTRYPOINT with our custom script
ENTRYPOINT ["/app/entrypoint.sh"]
CMD []
|
||||||
120
app/app.py
Normal file
120
app/app.py
Normal file
@@ -0,0 +1,120 @@
|
|||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
import uuid
|
||||||
|
from flask import Flask, request, render_template, send_file, jsonify
|
||||||
|
from werkzeug.utils import secure_filename
|
||||||
|
|
||||||
|
# Flask application object; the route handlers below are registered on it.
app = Flask(__name__)

# Roots under which per-session directories are created. Both are relative
# paths, so they resolve against the process's working directory.
BASE_UPLOAD_FOLDER = 'uploads'
BASE_OUTPUT_FOLDER = 'output'

# Make sure both roots exist as soon as the module is imported.
for _base_dir in (BASE_UPLOAD_FOLDER, BASE_OUTPUT_FOLDER):
    os.makedirs(_base_dir, exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
|
def cleanup_session_folders(session_id):
    """Delete the upload and output directories that belong to ``session_id``.

    A directory that does not exist is simply left alone.
    """
    for base_dir in (BASE_UPLOAD_FOLDER, BASE_OUTPUT_FOLDER):
        session_dir = os.path.join(base_dir, session_id)
        if os.path.exists(session_dir):
            shutil.rmtree(session_dir)
|
||||||
|
|
||||||
|
|
||||||
|
def create_session_folders(session_id):
    """
    Return fresh ``(upload_folder, output_folder)`` paths for ``session_id``.

    Directories left over from an earlier upload under the same session id
    are removed first, so files from different uploads are never mixed.
    """
    cleanup_session_folders(session_id)
    session_dirs = tuple(
        os.path.join(base_dir, session_id)
        for base_dir in (BASE_UPLOAD_FOLDER, BASE_OUTPUT_FOLDER)
    )
    for session_dir in session_dirs:
        os.makedirs(session_dir, exist_ok=True)
    return session_dirs
|
||||||
|
|
||||||
|
|
||||||
|
def run_ocrmypdf(input_pdf, output_pdf, jobs=4):
    """Run the ``ocrmypdf`` command-line tool on a single PDF.

    Args:
        input_pdf: Path of the PDF to OCR.
        output_pdf: Path the PDF/A result is written to.
        jobs: Value passed to ``--jobs``. Defaults to 4, the value that was
            previously hard-coded.

    Raises:
        ValueError: ocrmypdf reported that the file already contains text
            (exit status 6, or ``PriorOcrFoundError`` in its stderr).
        RuntimeError: ocrmypdf exited with any other non-zero status; the
            message carries its stderr output.
    """
    # Exit status OCRmyPDF documents for "already_done_ocr".
    already_done_ocr = 6

    cmd = [
        'ocrmypdf',
        '--rotate-pages',
        '--deskew',
        '--output-type', 'pdfa',
        '--jobs', str(jobs),
        input_pdf,
        output_pdf,
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode == 0:
        return

    # Prefer the exit status; keep the stderr match as a fallback so the
    # previous detection still works.
    stderr = result.stderr or ''
    if result.returncode == already_done_ocr or "PriorOcrFoundError" in stderr:
        raise ValueError("This file already contains selectable text and was skipped.")
    raise RuntimeError(f"OCRmyPDF failed: {result.stderr}")
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/')
def index():
    """Render the upload page, handing it a newly generated session id."""
    # Every page load gets its own random UUID.
    return render_template('index.html', session_id=str(uuid.uuid4()))
|
||||||
|
|
||||||
|
|
||||||
|
def _is_canonical_uuid(value):
    """Return True when ``value`` is a UUID string in canonical hyphenated form."""
    try:
        return str(uuid.UUID(value)) == value.lower()
    except (ValueError, AttributeError, TypeError):
        return False


@app.route('/upload/<session_id>', methods=['POST'])
def upload_files(session_id):
    """OCR every uploaded file and bundle the results into a ZIP archive.

    Args:
        session_id: Session identifier taken from the URL. It must be a
            canonical UUID string, because it is used as a directory name
            that is created and recursively deleted.

    Returns:
        A JSON response. On success it carries ``download_url``,
        ``processed_files`` and ``skipped_files``. Otherwise it carries an
        ``error`` message (status 400 for bad requests, 500 for processing
        failures, and the default status when every file was skipped).
    """
    # The id becomes part of a path passed to shutil.rmtree, so anything
    # that is not a plain UUID (e.g. "..") is rejected up front.
    if not _is_canonical_uuid(session_id):
        return jsonify({'error': 'Invalid session id.'}), 400

    # Create fresh session folders, clearing any previous data.
    upload_folder, output_folder = create_session_folders(session_id)

    if 'files' not in request.files:
        return jsonify({'error': 'No files were uploaded.'}), 400

    processed_files = []
    skipped_files = []

    for file in request.files.getlist('files'):
        # secure_filename can reduce a name to '' (e.g. "..."); saving to
        # such a name would target the session directory itself.
        filename = secure_filename(file.filename or '')
        if not filename:
            continue

        input_path = os.path.join(upload_folder, filename)
        output_path = os.path.join(output_folder, filename)
        file.save(input_path)

        try:
            run_ocrmypdf(input_path, output_path)
        except ValueError:
            # run_ocrmypdf signals "already contains text" with ValueError.
            skipped_files.append(filename)
        except Exception as e:
            # If there's an error, cleanup the session folders
            cleanup_session_folders(session_id)
            return jsonify({'error': f"Failed to process {filename}: {str(e)}"}), 500
        else:
            processed_files.append(filename)

    if not processed_files:
        if not skipped_files:
            # Nothing usable was submitted, so the "skipped" message below
            # would be misleading.
            return jsonify({'error': 'No files were uploaded.'}), 400
        return jsonify({
            'error': 'All files were skipped because they already contain selectable text.',
            'skipped_files': skipped_files
        })

    # Create a zip file containing only the current upload's processed files.
    zip_filename = os.path.join(output_folder, 'processed_files.zip')
    zip_result = subprocess.run(
        ['zip', '-j', zip_filename]
        + [os.path.join(output_folder, f) for f in processed_files],
        capture_output=True,
        text=True,
    )
    if zip_result.returncode != 0:
        # Without this check the client would receive a download link to an
        # archive that was never written.
        cleanup_session_folders(session_id)
        return jsonify({'error': f"Failed to create ZIP archive: {zip_result.stderr}"}), 500

    return jsonify({
        'download_url': f'/download/{session_id}/processed_files.zip',
        'processed_files': processed_files,
        'skipped_files': skipped_files
    })
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/download/<session_id>/<filename>')
def download_file(session_id, filename):
    """Send a file from a session's output directory as an attachment.

    Args:
        session_id: Session identifier from the URL; must be a canonical
            UUID string.
        filename: Name of the file inside that session's output directory.

    Returns:
        The file as an attachment, or a JSON error with status 404 when the
        session id or file name is invalid or the file does not exist.
    """
    # Both URL parts end up in a filesystem path, so validate them first.
    try:
        valid_session = str(uuid.UUID(session_id)) == session_id.lower()
    except ValueError:
        valid_session = False
    safe_name = secure_filename(filename)
    if not valid_session or not safe_name:
        return jsonify({'error': 'File not found.'}), 404

    # Output files are written relative to the working directory, so make
    # the path absolute here instead of letting send_file resolve it.
    file_path = os.path.abspath(
        os.path.join(BASE_OUTPUT_FOLDER, session_id, safe_name)
    )
    if not os.path.isfile(file_path):
        return jsonify({'error': 'File not found.'}), 404
    return send_file(file_path, as_attachment=True)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Direct execution: serve on every interface, port 5000, debugger off.
    app.run(debug=False, port=5000, host='0.0.0.0')
|
||||||
7
app/templates/bootstrap.min.css
vendored
Normal file
7
app/templates/bootstrap.min.css
vendored
Normal file
File diff suppressed because one or more lines are too long
153
app/templates/index.html
Normal file
153
app/templates/index.html
Normal file
@@ -0,0 +1,153 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<title>OCRmyPDF WebUI</title>
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
|
||||||
|
<!-- Inline minimal CSS for styling (no external dependencies) -->
|
||||||
|
<style>
|
||||||
|
/* Basic reset */
|
||||||
|
* {
|
||||||
|
box-sizing: border-box;
|
||||||
|
margin: 0;
|
||||||
|
padding: 0;
|
||||||
|
}
|
||||||
|
body {
|
||||||
|
font-family: Arial, sans-serif;
|
||||||
|
background-color: #f8f9fa;
|
||||||
|
padding: 20px;
|
||||||
|
}
|
||||||
|
.container {
|
||||||
|
max-width: 800px;
|
||||||
|
margin: auto;
|
||||||
|
background: #fff;
|
||||||
|
padding: 20px;
|
||||||
|
border-radius: 0.3em;
|
||||||
|
box-shadow: 0 0 10px rgba(0,0,0,0.1);
|
||||||
|
}
|
||||||
|
h2 {
|
||||||
|
margin-bottom: 20px;
|
||||||
|
}
|
||||||
|
input[type="file"] {
|
||||||
|
margin-bottom: 10px;
|
||||||
|
}
|
||||||
|
.btn {
|
||||||
|
display: inline-block;
|
||||||
|
padding: 0.5rem 1rem;
|
||||||
|
font-size: 1rem;
|
||||||
|
font-weight: bold;
|
||||||
|
text-align: center;
|
||||||
|
color: #fff;
|
||||||
|
background-color: #007bff;
|
||||||
|
border: none;
|
||||||
|
border-radius: 0.25rem;
|
||||||
|
cursor: pointer;
|
||||||
|
text-decoration: none;
|
||||||
|
}
|
||||||
|
.btn:disabled {
|
||||||
|
opacity: 0.65;
|
||||||
|
cursor: not-allowed;
|
||||||
|
}
|
||||||
|
.alert {
|
||||||
|
padding: 0.75rem 1.25rem;
|
||||||
|
margin: 1rem 0;
|
||||||
|
border: 1px solid transparent;
|
||||||
|
border-radius: 0.25rem;
|
||||||
|
}
|
||||||
|
.alert-info {
|
||||||
|
color: #055160;
|
||||||
|
background-color: #cff4fc;
|
||||||
|
border-color: #b6effb;
|
||||||
|
}
|
||||||
|
.alert-success {
|
||||||
|
color: #0f5132;
|
||||||
|
background-color: #d1e7dd;
|
||||||
|
border-color: #badbcc;
|
||||||
|
}
|
||||||
|
.alert-danger {
|
||||||
|
color: #842029;
|
||||||
|
background-color: #f8d7da;
|
||||||
|
border-color: #f5c2c7;
|
||||||
|
}
|
||||||
|
.mt-2 {
|
||||||
|
margin-top: 0.5rem;
|
||||||
|
}
|
||||||
|
.mt-3 {
|
||||||
|
margin-top: 1rem;
|
||||||
|
}
|
||||||
|
.mt-5 {
|
||||||
|
margin-top: 3rem;
|
||||||
|
}
|
||||||
|
.mb-4 {
|
||||||
|
margin-bottom: 1.5rem;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
|
||||||
|
<!-- Inline JavaScript -->
|
||||||
|
<script>
|
||||||
|
/**
 * Upload the selected files for OCR processing and render the outcome.
 *
 * All server-supplied text (file names, error messages) is inserted as text
 * nodes rather than through innerHTML, so it can never be parsed as markup.
 *
 * @param {string} sessionId - Session id embedded in the page by the server.
 * @returns {Promise<void>}
 */
async function uploadFiles(sessionId) {
    const files = document.getElementById('fileInput').files;
    const uploadBtn = document.getElementById('uploadBtn');
    const status = document.getElementById('status');
    const resultLink = document.getElementById('resultLink');
    const processedList = document.getElementById('processedList');
    const skippedList = document.getElementById('skippedList');

    // Replace the status area with a single alert box of the given kind.
    const showAlert = (kind, message) => {
        const box = document.createElement('div');
        box.className = 'alert alert-' + kind;
        box.textContent = message;
        status.textContent = '';
        status.appendChild(box);
    };

    // Render a bold heading followed by one name per line; empty lists clear the target.
    const showList = (target, heading, names) => {
        target.textContent = '';
        if (!names || names.length === 0) {
            return;
        }
        const title = document.createElement('strong');
        title.textContent = heading;
        target.appendChild(title);
        for (const name of names) {
            target.appendChild(document.createElement('br'));
            target.appendChild(document.createTextNode(name));
        }
    };

    resultLink.style.display = 'none';
    processedList.textContent = '';
    skippedList.textContent = '';
    status.textContent = '';

    // An upload request resets the session's folders on the server, so do
    // not send one when there is nothing to process.
    if (files.length === 0) {
        showAlert('danger', 'Please select at least one file.');
        return;
    }

    const formData = new FormData();
    for (const file of files) {
        formData.append('files', file);
    }

    uploadBtn.disabled = true;
    uploadBtn.textContent = 'Processing...';
    showAlert('info', 'Processing files, please wait...');

    try {
        const response = await fetch(`/upload/${encodeURIComponent(sessionId)}`, {
            method: 'POST',
            body: formData
        });

        let result;
        try {
            result = await response.json();
        } catch (parseError) {
            // A gateway or server error page is not JSON; report the status
            // instead of a parser error.
            throw new Error('Server returned an unexpected response (HTTP ' + response.status + ').');
        }

        if (result.download_url) {
            showAlert('success', 'Processing complete! Download your files below.');
            resultLink.href = result.download_url;
            resultLink.style.display = 'block';
            showList(processedList, 'Processed Files:', result.processed_files);
        } else {
            showAlert('danger', result.error || 'Unknown error occurred.');
        }
        // The server also returns skipped_files when every file was skipped.
        showList(skippedList, 'Skipped Files (Already Contain Text):', result.skipped_files);
    } catch (error) {
        showAlert('danger', 'Error: ' + error.message);
    } finally {
        uploadBtn.disabled = false;
        uploadBtn.textContent = 'Upload and Process';
    }
}
|
||||||
|
</script>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div class="container mt-5">
|
||||||
|
<h2 class="mb-4">OCRmyPDF WebUI</h2>
|
||||||
|
<input type="file" id="fileInput" multiple>
|
||||||
|
<button id="uploadBtn" class="btn" onclick="uploadFiles('{{ session_id }}')">Upload and Process</button>
|
||||||
|
<div id="status" class="mt-3"></div>
|
||||||
|
<div id="processedList" class="mt-2"></div>
|
||||||
|
<div id="skippedList" class="mt-2"></div>
|
||||||
|
<a id="resultLink" class="btn mt-3" href="#" style="display:none;">Download Processed Files (ZIP)</a>
|
||||||
|
</div>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
11
docker-compose.yaml
Normal file
11
docker-compose.yaml
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
services:
  # Web UI container, built from the Dockerfile in this directory.
  webui:
    build:
      context: .
    ports:
      - "5000:5000"
    # Uncomment to keep uploads and results on the host:
    # volumes:
    #   - ./uploads:/app/uploads
    #   - ./output:/app/output
    environment:
      # Read by entrypoint.sh: Gunicorn timeout in seconds.
      GUNICORN_TIMEOUT: "300"
      # Read by entrypoint.sh: container timezone.
      TZ: Australia/Sydney
|
||||||
13
entrypoint.sh
Executable file
13
entrypoint.sh
Executable file
@@ -0,0 +1,13 @@
|
|||||||
|
#!/bin/bash
# Container entrypoint: optionally configure the timezone, then replace this
# shell with Gunicorn serving app:app on port 5000.
#
# Environment variables:
#   TZ                zoneinfo name (e.g. Australia/Sydney); optional.
#   GUNICORN_TIMEOUT  Gunicorn timeout in seconds (default 30).
#   GUNICORN_WORKERS  number of Gunicorn workers (default 4).

# Set timezone if TZ environment variable is provided
if [ -n "${TZ:-}" ]; then
    # Only link zones that actually exist; otherwise /etc/localtime would
    # become a dangling symlink.
    if [ -f "/usr/share/zoneinfo/${TZ}" ]; then
        ln -snf "/usr/share/zoneinfo/${TZ}" /etc/localtime
        echo "$TZ" > /etc/timezone
    else
        echo "Warning: unknown timezone '${TZ}'; leaving the timezone unchanged." >&2
    fi
fi

# Set defaults for Gunicorn settings not provided via environment variables
: "${GUNICORN_TIMEOUT:=30}"
: "${GUNICORN_WORKERS:=4}"
echo "Starting Gunicorn with timeout ${GUNICORN_TIMEOUT} seconds..."

# Start Gunicorn with the specified timeout; exec hands it this shell's PID
# so it receives container signals directly.
exec gunicorn --timeout "$GUNICORN_TIMEOUT" --workers "$GUNICORN_WORKERS" --bind 0.0.0.0:5000 app:app
|
||||||
Binary file not shown.
BIN
output/e39fdf6f-2402-447c-a19e-560c2c6939a1/processed_files.zip
Normal file
BIN
output/e39fdf6f-2402-447c-a19e-560c2c6939a1/processed_files.zip
Normal file
Binary file not shown.
Binary file not shown.
Reference in New Issue
Block a user