First big push
This commit is contained in:
21
Dockerfile
Normal file
21
Dockerfile
Normal file
@@ -0,0 +1,21 @@
|
||||
# Base image tag is overridable at build time. The default keeps the previous
# behaviour; pass --build-arg OCRMYPDF_TAG=<release> to pin a specific release
# and get reproducible builds.
ARG OCRMYPDF_TAG=latest
FROM jbarlow83/ocrmypdf:${OCRMYPDF_TAG}

# Install packages, including tzdata for timezone support.
# Recommended-but-optional packages are skipped and the apt lists are removed
# in the same layer so they never end up in the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        python3-flask \
        python3-pip \
        tzdata \
        zip \
    && rm -rf /var/lib/apt/lists/*

# Install gunicorn using pip with the flag to override external management.
# This runs before the application is copied so that code changes do not
# invalidate the dependency layer; --no-cache-dir keeps pip's cache out of it.
RUN pip3 install --no-cache-dir --break-system-packages gunicorn

WORKDIR /app

# Copy application code and entrypoint script
COPY app /app
COPY entrypoint.sh /app/entrypoint.sh
RUN chmod +x /app/entrypoint.sh

# Expose the application port (Gunicorn binds to 5000 in entrypoint.sh)
EXPOSE 5000

# Override the base image's ENTRYPOINT with our custom script
ENTRYPOINT ["/app/entrypoint.sh"]
CMD []
|
||||
120
app/app.py
Normal file
120
app/app.py
Normal file
@@ -0,0 +1,120 @@
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import uuid
|
||||
from flask import Flask, request, render_template, send_file, jsonify
|
||||
from werkzeug.utils import secure_filename
|
||||
|
||||
app = Flask(__name__)

# Anchor the working directories to this module's location instead of the
# process's current working directory, so that saving uploads, running the
# external tools and serving downloads all resolve the same paths no matter
# which directory the server was started from.
_APP_DIR = os.path.dirname(os.path.abspath(__file__))
BASE_UPLOAD_FOLDER = os.path.join(_APP_DIR, 'uploads')
BASE_OUTPUT_FOLDER = os.path.join(_APP_DIR, 'output')
os.makedirs(BASE_UPLOAD_FOLDER, exist_ok=True)
os.makedirs(BASE_OUTPUT_FOLDER, exist_ok=True)
|
||||
|
||||
|
||||
def cleanup_session_folders(session_id):
    """Delete the upload and output directories of ``session_id`` if they exist."""
    session_dirs = (
        os.path.join(BASE_UPLOAD_FOLDER, session_id),
        os.path.join(BASE_OUTPUT_FOLDER, session_id),
    )
    for session_dir in session_dirs:
        if os.path.exists(session_dir):
            shutil.rmtree(session_dir)
|
||||
|
||||
|
||||
def create_session_folders(session_id):
    """
    Reset the working directories of ``session_id`` and return them.

    Whatever a previous upload left behind is removed first through
    ``cleanup_session_folders``, then both directories are created again
    so the new upload starts from empty folders.

    Returns a ``(upload_folder, output_folder)`` tuple of paths.
    """
    cleanup_session_folders(session_id)
    fresh_dirs = tuple(
        os.path.join(base_dir, session_id)
        for base_dir in (BASE_UPLOAD_FOLDER, BASE_OUTPUT_FOLDER)
    )
    for fresh_dir in fresh_dirs:
        os.makedirs(fresh_dir, exist_ok=True)
    return fresh_dirs
|
||||
|
||||
|
||||
def run_ocrmypdf(input_pdf, output_pdf):
    """
    Run the ``ocrmypdf`` command line tool on ``input_pdf``, writing ``output_pdf``.

    The tool is invoked with page rotation, deskewing, PDF/A output and
    four parallel jobs.

    Returns:
        None when ocrmypdf exits with status 0.

    Raises:
        ValueError: the input already contains text (ocrmypdf exit status 6,
            or ``PriorOcrFoundError`` mentioned on stderr).
        RuntimeError: ocrmypdf failed for any other reason; the message
            carries the captured stderr.
    """
    # Exit status ocrmypdf uses for "file already appears to contain text".
    # Checking it in addition to the stderr text keeps the skip detection
    # working even if the wording of the error message changes.
    already_done_ocr = 6

    cmd = [
        'ocrmypdf',
        '--rotate-pages',
        '--deskew',
        '--output-type', 'pdfa',
        '--jobs', '4',
        input_pdf,
        output_pdf,
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode == 0:
        return

    stderr = result.stderr or ''
    if result.returncode == already_done_ocr or "PriorOcrFoundError" in stderr:
        raise ValueError("This file already contains selectable text and was skipped.")
    raise RuntimeError(f"OCRmyPDF failed: {stderr}")
|
||||
|
||||
|
||||
@app.route('/')
def index():
    """Serve the upload page, embedding a freshly generated session id."""
    # Every page load gets its own random UUID.
    return render_template('index.html', session_id=str(uuid.uuid4()))
|
||||
|
||||
|
||||
@app.route('/upload/<session_id>', methods=['POST'])
def upload_files(session_id):
    """
    Receive the files posted under the ``files`` form field, OCR each one
    and bundle the results into ``processed_files.zip`` for this session.

    Responses (all JSON):
        * 400 when the session id is malformed, no files were sent, or a
          file name is unusable after sanitising.
        * 500 when OCR of a file or creation of the archive fails.
        * 200 with an ``error`` and ``skipped_files`` when every file already
          contained text.
        * 200 with ``download_url``, ``processed_files`` and ``skipped_files``
          on success.
    """
    # session_id comes straight from the URL and is used to build paths that
    # are recursively deleted, so only the canonical UUID form that index()
    # hands out is accepted (this rejects values such as '..').
    try:
        if str(uuid.UUID(session_id)) != session_id:
            raise ValueError(session_id)
    except ValueError:
        return jsonify({'error': 'Invalid session id.'}), 400

    # Validate the request before touching the filesystem.
    if 'files' not in request.files:
        return jsonify({'error': 'No files were uploaded.'}), 400

    # Create fresh session folders, clearing any previous data.
    upload_folder, output_folder = create_session_folders(session_id)

    processed_files = []
    skipped_files = []

    for file in request.files.getlist('files'):
        if file.filename == '':
            continue

        filename = secure_filename(file.filename)
        if not filename:
            # secure_filename() can strip a name down to nothing; saving to
            # the resulting path would target the session directory itself.
            cleanup_session_folders(session_id)
            return jsonify({'error': 'One of the uploaded files has an unusable file name.'}), 400

        input_path = os.path.join(upload_folder, filename)
        output_path = os.path.join(output_folder, filename)

        file.save(input_path)

        try:
            run_ocrmypdf(input_path, output_path)
            processed_files.append(filename)
        except ValueError:
            # run_ocrmypdf signals "already contains text" with ValueError.
            skipped_files.append(filename)
        except Exception as e:
            # If there's an error, cleanup the session folders
            cleanup_session_folders(session_id)
            return jsonify({'error': f"Failed to process {filename}: {str(e)}"}), 500

    if not processed_files:
        # Nothing to download, so do not leave the session folders behind.
        cleanup_session_folders(session_id)
        if not skipped_files:
            # Only empty file entries were submitted.
            return jsonify({'error': 'No files were uploaded.'}), 400
        return jsonify({
            'error': 'All files were skipped because they already contain selectable text.',
            'skipped_files': skipped_files
        })

    # Create a zip file containing only the current upload's processed files.
    zip_filename = os.path.join(output_folder, 'processed_files.zip')
    zip_result = subprocess.run(
        ['zip', '-j', zip_filename] + [os.path.join(output_folder, f) for f in processed_files]
    )
    if zip_result.returncode != 0:
        # Without this check the client would receive a link to a missing archive.
        cleanup_session_folders(session_id)
        return jsonify({'error': 'Failed to create the download archive.'}), 500

    return jsonify({
        'download_url': f'/download/{session_id}/processed_files.zip',
        'processed_files': processed_files,
        'skipped_files': skipped_files
    })
|
||||
|
||||
|
||||
@app.route('/download/<session_id>/<filename>')
def download_file(session_id, filename):
    """
    Send ``filename`` from the output folder of ``session_id`` as an attachment.

    Both URL parts are untrusted. The session id must be a canonical UUID
    string and the file name must already be in the form ``secure_filename``
    would produce; anything else (for example ``..``) gets a 404 JSON
    response so the resulting path cannot leave the session's output folder.
    """
    try:
        session_ok = str(uuid.UUID(session_id)) == session_id
    except ValueError:
        session_ok = False

    if not session_ok or secure_filename(filename) != filename:
        return jsonify({'error': 'File not found.'}), 404

    file_path = os.path.join(BASE_OUTPUT_FOLDER, session_id, filename)
    return send_file(file_path, as_attachment=True)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Direct execution only; the container launches the app through Gunicorn
    # from entrypoint.sh instead.
    listen_host, listen_port = '0.0.0.0', 5000
    app.run(host=listen_host, port=listen_port, debug=False)
|
||||
7
app/templates/bootstrap.min.css
vendored
Normal file
7
app/templates/bootstrap.min.css
vendored
Normal file
File diff suppressed because one or more lines are too long
153
app/templates/index.html
Normal file
153
app/templates/index.html
Normal file
@@ -0,0 +1,153 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title>OCRmyPDF WebUI</title>
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
|
||||
<!-- Inline minimal CSS for styling (no external dependencies) -->
|
||||
<style>
|
||||
/* Basic reset */
|
||||
* {
|
||||
box-sizing: border-box;
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
}
|
||||
body {
|
||||
font-family: Arial, sans-serif;
|
||||
background-color: #f8f9fa;
|
||||
padding: 20px;
|
||||
}
|
||||
.container {
|
||||
max-width: 800px;
|
||||
margin: auto;
|
||||
background: #fff;
|
||||
padding: 20px;
|
||||
border-radius: 0.3em;
|
||||
box-shadow: 0 0 10px rgba(0,0,0,0.1);
|
||||
}
|
||||
h2 {
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
input[type="file"] {
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
.btn {
|
||||
display: inline-block;
|
||||
padding: 0.5rem 1rem;
|
||||
font-size: 1rem;
|
||||
font-weight: bold;
|
||||
text-align: center;
|
||||
color: #fff;
|
||||
background-color: #007bff;
|
||||
border: none;
|
||||
border-radius: 0.25rem;
|
||||
cursor: pointer;
|
||||
text-decoration: none;
|
||||
}
|
||||
.btn:disabled {
|
||||
opacity: 0.65;
|
||||
cursor: not-allowed;
|
||||
}
|
||||
.alert {
|
||||
padding: 0.75rem 1.25rem;
|
||||
margin: 1rem 0;
|
||||
border: 1px solid transparent;
|
||||
border-radius: 0.25rem;
|
||||
}
|
||||
.alert-info {
|
||||
color: #055160;
|
||||
background-color: #cff4fc;
|
||||
border-color: #b6effb;
|
||||
}
|
||||
.alert-success {
|
||||
color: #0f5132;
|
||||
background-color: #d1e7dd;
|
||||
border-color: #badbcc;
|
||||
}
|
||||
.alert-danger {
|
||||
color: #842029;
|
||||
background-color: #f8d7da;
|
||||
border-color: #f5c2c7;
|
||||
}
|
||||
.mt-2 {
|
||||
margin-top: 0.5rem;
|
||||
}
|
||||
.mt-3 {
|
||||
margin-top: 1rem;
|
||||
}
|
||||
.mt-5 {
|
||||
margin-top: 3rem;
|
||||
}
|
||||
.mb-4 {
|
||||
margin-bottom: 1.5rem;
|
||||
}
|
||||
</style>
|
||||
|
||||
<!-- Inline JavaScript -->
|
||||
<script>
|
||||
/**
 * Upload the selected files for OCR processing and render the outcome.
 *
 * Posts every file chosen in #fileInput to /upload/<sessionId>, then shows
 * a status alert, the processed/skipped file lists and the download link.
 * All server-supplied text is inserted as text nodes, never as HTML, so an
 * error message or file name cannot inject markup into the page.
 *
 * @param {string} sessionId Session id embedded in the page by the server.
 */
async function uploadFiles(sessionId) {
    const files = document.getElementById('fileInput').files;
    const formData = new FormData();
    const uploadBtn = document.getElementById('uploadBtn');
    const status = document.getElementById('status');
    const resultLink = document.getElementById('resultLink');
    const processedList = document.getElementById('processedList');
    const skippedList = document.getElementById('skippedList');

    // Replace the status area with a single alert box of the given kind.
    const showAlert = (kind, message) => {
        const box = document.createElement('div');
        box.className = 'alert alert-' + kind;
        box.textContent = message;
        status.textContent = '';
        status.appendChild(box);
    };

    // Render "<strong>heading</strong><br>name<br>name..." using DOM nodes.
    const showFileList = (target, heading, names) => {
        target.textContent = '';
        if (!names || names.length === 0) {
            return;
        }
        const title = document.createElement('strong');
        title.textContent = heading;
        target.appendChild(title);
        for (const name of names) {
            target.appendChild(document.createElement('br'));
            target.appendChild(document.createTextNode(name));
        }
    };

    resultLink.style.display = 'none';
    processedList.textContent = '';
    skippedList.textContent = '';
    status.textContent = '';

    // Do not send an empty request when nothing has been selected.
    if (files.length === 0) {
        showAlert('danger', 'Please select at least one file.');
        return;
    }

    for (let i = 0; i < files.length; i++) {
        formData.append('files', files[i]);
    }

    uploadBtn.disabled = true;
    uploadBtn.textContent = 'Processing...';
    showAlert('info', 'Processing files, please wait...');

    try {
        const response = await fetch(`/upload/${sessionId}`, {
            method: 'POST',
            body: formData
        });

        // A non-JSON body (e.g. a proxy or worker-timeout error page) would
        // otherwise surface as an opaque JSON parse error.
        let result;
        try {
            result = await response.json();
        } catch (parseError) {
            throw new Error('Server returned an unexpected response (HTTP ' + response.status + ').');
        }

        if (result.download_url) {
            showAlert('success', 'Processing complete! Download your files below.');
            resultLink.href = result.download_url;
            resultLink.style.display = 'block';
            showFileList(processedList, 'Processed Files:', result.processed_files);
        } else {
            showAlert('danger', result.error || 'Unknown error occurred.');
        }
        // The server also reports skipped files when every file was skipped.
        showFileList(skippedList, 'Skipped Files (Already Contain Text):', result.skipped_files);
    } catch (error) {
        showAlert('danger', 'Error: ' + error.message);
    } finally {
        uploadBtn.disabled = false;
        uploadBtn.textContent = 'Upload and Process';
    }
}
|
||||
</script>
|
||||
</head>
|
||||
<body>
|
||||
<div class="container mt-5">
|
||||
<h2 class="mb-4">OCRmyPDF WebUI</h2>
|
||||
<input type="file" id="fileInput" multiple>
|
||||
<button id="uploadBtn" class="btn" onclick="uploadFiles('{{ session_id }}')">Upload and Process</button>
|
||||
<div id="status" class="mt-3"></div>
|
||||
<div id="processedList" class="mt-2"></div>
|
||||
<div id="skippedList" class="mt-2"></div>
|
||||
<a id="resultLink" class="btn mt-3" href="#" style="display:none;">Download Processed Files (ZIP)</a>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
11
docker-compose.yaml
Normal file
11
docker-compose.yaml
Normal file
@@ -0,0 +1,11 @@
|
||||
# Compose definition for the OCRmyPDF web UI container.
services:
  webui:
    # Build the image from the Dockerfile in this directory.
    build:
      context: .
    ports:
      - "5000:5000"
    # Uncomment to keep uploads and results on the host:
    # volumes:
    #   - ./uploads:/app/uploads
    #   - ./output:/app/output
    environment:
      # Both variables are read by entrypoint.sh at container start.
      GUNICORN_TIMEOUT: "300"
      TZ: "Australia/Sydney"
|
||||
13
entrypoint.sh
Executable file
13
entrypoint.sh
Executable file
@@ -0,0 +1,13 @@
|
||||
#!/bin/bash
# Container entrypoint: optionally apply the TZ timezone, then replace this
# shell with Gunicorn serving app:app on port 5000.

# Set timezone if TZ environment variable is provided.
# The zone file is checked first so that a typo does not leave
# /etc/localtime pointing at a path that does not exist.
if [ -n "${TZ:-}" ]; then
    if [ -f "/usr/share/zoneinfo/${TZ}" ]; then
        if ! { ln -snf "/usr/share/zoneinfo/${TZ}" /etc/localtime && echo "$TZ" > /etc/timezone; }; then
            echo "Warning: could not apply timezone '${TZ}'." >&2
        fi
    else
        echo "Warning: unknown timezone '${TZ}'; leaving the system timezone unchanged." >&2
    fi
fi

# Set defaults for the Gunicorn timeout and worker count if they are not
# provided via environment variables.
: "${GUNICORN_TIMEOUT:=30}"
: "${GUNICORN_WORKERS:=4}"
echo "Starting Gunicorn with timeout ${GUNICORN_TIMEOUT} seconds..."

# Start Gunicorn with the specified timeout; exec makes it the container's
# main process.
exec gunicorn --timeout "$GUNICORN_TIMEOUT" --workers "$GUNICORN_WORKERS" --bind 0.0.0.0:5000 app:app
|
||||
Binary file not shown.
BIN
output/e39fdf6f-2402-447c-a19e-560c2c6939a1/processed_files.zip
Normal file
BIN
output/e39fdf6f-2402-447c-a19e-560c2c6939a1/processed_files.zip
Normal file
Binary file not shown.
Binary file not shown.
Reference in New Issue
Block a user