First big push

This commit is contained in:
2025-03-18 22:46:53 +11:00
parent dbb6acf784
commit 699fc45de8
9 changed files with 325 additions and 0 deletions

120
app/app.py Normal file
View File

@@ -0,0 +1,120 @@
import os
import shutil
import subprocess
import uuid
import zipfile

from flask import Flask, request, render_template, send_file, jsonify
from werkzeug.utils import secure_filename
app = Flask(__name__)

# Root directories under which each session gets its own sub-folder.
BASE_UPLOAD_FOLDER = 'uploads'
BASE_OUTPUT_FOLDER = 'output'

# Make sure both roots exist before the first request is served.
for _base_dir in (BASE_UPLOAD_FOLDER, BASE_OUTPUT_FOLDER):
    os.makedirs(_base_dir, exist_ok=True)
def cleanup_session_folders(session_id):
    """Remove the upload and output folders belonging to a session.

    Folders that do not exist are ignored, so the call is safe to repeat.

    Args:
        session_id: Name of the per-session sub-folder inside
            BASE_UPLOAD_FOLDER and BASE_OUTPUT_FOLDER.

    Raises:
        ValueError: If session_id is not a single plain path component
            ('', '.', '..', or anything containing a path separator).
    """
    # session_id arrives from the request URL. A value such as '..' would make
    # the joined path point at the parent of the base folder, and rmtree would
    # then delete far more than one session's files.
    if session_id in ('', '.', '..') or os.path.basename(session_id) != session_id:
        raise ValueError(f"Invalid session id: {session_id!r}")

    for base_folder in (BASE_UPLOAD_FOLDER, BASE_OUTPUT_FOLDER):
        try:
            shutil.rmtree(os.path.join(base_folder, session_id))
        except FileNotFoundError:
            # Nothing to clean up for this session in this base folder.
            pass
def create_session_folders(session_id):
    """
    Start a session from a clean slate.

    Any folders left over from an earlier upload with the same session id are
    removed first, then empty upload and output folders are created so files
    from previous uploads cannot mix with new ones.

    Returns a tuple ``(upload_folder, output_folder)``.
    """
    cleanup_session_folders(session_id)
    session_dirs = tuple(
        os.path.join(base_folder, session_id)
        for base_folder in (BASE_UPLOAD_FOLDER, BASE_OUTPUT_FOLDER)
    )
    for directory in session_dirs:
        os.makedirs(directory, exist_ok=True)
    return session_dirs
def run_ocrmypdf(input_pdf, output_pdf, *, jobs=4):
    """Run the ``ocrmypdf`` command-line tool on a single PDF.

    Pages are auto-rotated and deskewed and the result is written as PDF/A.

    Args:
        input_pdf: Path of the PDF to OCR.
        output_pdf: Path the OCRed PDF is written to.
        jobs: Value passed to ``--jobs``. Defaults to 4, the value that was
            previously hard-coded.

    Raises:
        ValueError: If OCRmyPDF reports that the file already contains text.
        RuntimeError: If OCRmyPDF exits with any other non-zero status; the
            message includes its stderr output.
    """
    # Exit status OCRmyPDF documents for "file already appears to contain text".
    already_has_text_exit_code = 6

    cmd = [
        'ocrmypdf',
        '--rotate-pages',
        '--deskew',
        '--output-type', 'pdfa',
        '--jobs', str(jobs),
        input_pdf,
        output_pdf,
    ]
    # errors='replace' keeps odd bytes in the tool's output from raising a
    # decoding error that would mask the real failure.
    result = subprocess.run(cmd, capture_output=True, text=True, errors='replace')
    if result.returncode == 0:
        return

    # Prefer the documented exit code; keep the stderr match as a fallback so
    # behavior is unchanged for versions that only expose the error name.
    if (result.returncode == already_has_text_exit_code
            or "PriorOcrFoundError" in result.stderr):
        raise ValueError("This file already contains selectable text and was skipped.")
    raise RuntimeError(f"OCRmyPDF failed: {result.stderr}")
@app.route('/')
def index():
    """Render the upload page, handing every visitor a fresh random session id."""
    return render_template('index.html', session_id=str(uuid.uuid4()))
@app.route('/upload/<session_id>', methods=['POST'])
def upload_files(session_id):
    """Receive uploaded PDFs, OCR each one, and bundle the results into a zip.

    Args:
        session_id: Session identifier taken from the URL; it names the
            per-session upload and output folders.

    Returns:
        A JSON response. On success it carries ``download_url``,
        ``processed_files`` and ``skipped_files``. When every file was
        skipped it carries ``error`` and ``skipped_files``. Invalid requests
        return status 400 and processing failures status 500, both with an
        ``error`` message.
    """
    # session_id comes straight from the URL and names folders that are
    # deleted and recreated below, so refuse anything that is not a single
    # plain path component before touching the filesystem.
    if session_id in ('.', '..') or os.path.basename(session_id) != session_id:
        return jsonify({'error': 'Invalid session id.'}), 400

    # Browsers send an empty-named part when no file was selected; treat that
    # the same as a request with no files at all.
    uploads = [f for f in request.files.getlist('files') if f.filename]
    if not uploads:
        return jsonify({'error': 'No files were uploaded.'}), 400

    # Create fresh session folders, clearing any previous data. This happens
    # only after the request is known to be valid.
    upload_folder, output_folder = create_session_folders(session_id)

    processed_files = []
    skipped_files = []
    # Names already taken in the output folder. The archive name is reserved
    # so an upload cannot collide with the zip written at the end.
    used_names = {'processed_files.zip'}

    for file in uploads:
        filename = secure_filename(file.filename)
        if not filename:
            # Nothing usable is left after sanitizing; saving would target
            # the session folder itself.
            cleanup_session_folders(session_id)
            return jsonify({'error': f"Invalid file name: {file.filename}"}), 400

        # Two uploads can sanitize to the same name; give later ones a
        # numeric suffix instead of silently overwriting the earlier file.
        stem, extension = os.path.splitext(filename)
        suffix = 1
        while filename in used_names:
            filename = f"{stem}_{suffix}{extension}"
            suffix += 1
        used_names.add(filename)

        input_path = os.path.join(upload_folder, filename)
        output_path = os.path.join(output_folder, filename)
        file.save(input_path)
        try:
            run_ocrmypdf(input_path, output_path)
        except ValueError:
            # run_ocrmypdf signals "already contains text" with ValueError.
            skipped_files.append(filename)
        except Exception as e:
            # If there's an error, cleanup the session folders
            cleanup_session_folders(session_id)
            return jsonify({'error': f"Failed to process {filename}: {str(e)}"}), 500
        else:
            processed_files.append(filename)

    if not processed_files:
        # Nothing will ever be downloaded for this session, so do not leave
        # the uploaded files on disk.
        cleanup_session_folders(session_id)
        return jsonify({
            'error': 'All files were skipped because they already contain selectable text.',
            'skipped_files': skipped_files
        })

    # Create a zip file containing only the current upload's processed files.
    # zipfile avoids depending on an external `zip` binary whose failure
    # would otherwise go unnoticed.
    zip_filename = os.path.join(output_folder, 'processed_files.zip')
    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as archive:
        for name in processed_files:
            archive.write(os.path.join(output_folder, name), arcname=name)

    return jsonify({
        'download_url': f'/download/{session_id}/processed_files.zip',
        'processed_files': processed_files,
        'skipped_files': skipped_files
    })
@app.route('/download/<session_id>/<filename>')
def download_file(session_id, filename):
    """Send a file from a session's output folder as an attachment.

    Args:
        session_id: Session identifier taken from the URL.
        filename: Name of the file inside that session's output folder.

    Returns:
        The file as a download, or a JSON error with status 404 when either
        URL part is not a plain file name or the file does not exist.
    """
    # Both parts come from the URL. Values such as '..' would let a request
    # step out of the output folder and fetch unrelated files.
    for part in (session_id, filename):
        if part in ('.', '..') or os.path.basename(part) != part:
            return jsonify({'error': 'File not found.'}), 404

    # Use an absolute path so the lookup is relative to the working
    # directory, where the output folder is created, rather than to the
    # application root that Flask applies to relative paths.
    file_path = os.path.abspath(
        os.path.join(BASE_OUTPUT_FOLDER, session_id, filename)
    )
    if not os.path.isfile(file_path):
        return jsonify({'error': 'File not found.'}), 404
    return send_file(file_path, as_attachment=True)
# Start Flask's built-in server only when this file is executed directly.
if __name__ == '__main__':
    # '0.0.0.0' listens on every network interface; debug mode stays off.
    app.run(host='0.0.0.0', port=5000, debug=False)

7
app/templates/bootstrap.min.css vendored Normal file

File diff suppressed because one or more lines are too long

153
app/templates/index.html Normal file
View File

@@ -0,0 +1,153 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>OCRmyPDF WebUI</title>
<meta name="viewport" content="width=device-width, initial-scale=1">
<!-- Inline minimal CSS for styling (no external dependencies) -->
<style>
/* Basic reset */
* {
box-sizing: border-box;
margin: 0;
padding: 0;
}
body {
font-family: Arial, sans-serif;
background-color: #f8f9fa;
padding: 20px;
}
.container {
max-width: 800px;
margin: auto;
background: #fff;
padding: 20px;
border-radius: 0.3em;
box-shadow: 0 0 10px rgba(0,0,0,0.1);
}
h2 {
margin-bottom: 20px;
}
input[type="file"] {
margin-bottom: 10px;
}
.btn {
display: inline-block;
padding: 0.5rem 1rem;
font-size: 1rem;
font-weight: bold;
text-align: center;
color: #fff;
background-color: #007bff;
border: none;
border-radius: 0.25rem;
cursor: pointer;
text-decoration: none;
}
.btn:disabled {
opacity: 0.65;
cursor: not-allowed;
}
.alert {
padding: 0.75rem 1.25rem;
margin: 1rem 0;
border: 1px solid transparent;
border-radius: 0.25rem;
}
.alert-info {
color: #055160;
background-color: #cff4fc;
border-color: #b6effb;
}
.alert-success {
color: #0f5132;
background-color: #d1e7dd;
border-color: #badbcc;
}
.alert-danger {
color: #842029;
background-color: #f8d7da;
border-color: #f5c2c7;
}
.mt-2 {
margin-top: 0.5rem;
}
.mt-3 {
margin-top: 1rem;
}
.mt-5 {
margin-top: 3rem;
}
.mb-4 {
margin-bottom: 1.5rem;
}
</style>
<!-- Inline JavaScript -->
<script>
/**
 * Upload the selected files for OCR and show the outcome on the page.
 *
 * Server-provided text (error messages, file names) is inserted as text
 * nodes rather than HTML so it can never be interpreted as markup.
 *
 * @param {string} sessionId - Session id rendered into the page by the server.
 */
async function uploadFiles(sessionId) {
    const files = document.getElementById('fileInput').files;
    const formData = new FormData();
    const uploadBtn = document.getElementById('uploadBtn');
    const status = document.getElementById('status');
    const resultLink = document.getElementById('resultLink');
    const processedList = document.getElementById('processedList');
    const skippedList = document.getElementById('skippedList');

    // Replace the status area with a single alert box of the given kind.
    const showAlert = (kind, message) => {
        const box = document.createElement('div');
        box.className = 'alert alert-' + kind;
        box.textContent = message;
        status.innerHTML = '';
        status.appendChild(box);
    };

    // Render a bold heading followed by one name per line.
    const renderList = (target, heading, names) => {
        target.innerHTML = '';
        if (!names || names.length === 0) {
            return;
        }
        const title = document.createElement('strong');
        title.textContent = heading;
        target.appendChild(title);
        names.forEach((name) => {
            target.appendChild(document.createElement('br'));
            target.appendChild(document.createTextNode(name));
        });
    };

    resultLink.style.display = 'none';
    processedList.innerHTML = '';
    skippedList.innerHTML = '';
    status.innerHTML = '';

    // Don't send a request that cannot succeed.
    if (files.length === 0) {
        showAlert('danger', 'Please select at least one file.');
        return;
    }

    for (let i = 0; i < files.length; i++) {
        formData.append('files', files[i]);
    }

    uploadBtn.disabled = true;
    uploadBtn.innerHTML = 'Processing...';
    showAlert('info', 'Processing files, please wait...');

    try {
        const response = await fetch(`/upload/${sessionId}`, {
            method: 'POST',
            body: formData
        });
        const result = await response.json();
        if (result.download_url) {
            showAlert('success', 'Processing complete! Download your files below.');
            resultLink.href = result.download_url;
            resultLink.style.display = 'block';
            renderList(processedList, 'Processed Files:', result.processed_files);
            renderList(skippedList, 'Skipped Files (Already Contain Text):', result.skipped_files);
        } else {
            showAlert('danger', result.error || 'Unknown error occurred.');
        }
    } catch (error) {
        showAlert('danger', 'Error: ' + error.message);
    } finally {
        uploadBtn.disabled = false;
        uploadBtn.innerHTML = 'Upload and Process';
    }
}
</script>
</head>
<body>
<div class="container mt-5">
<h2 class="mb-4">OCRmyPDF WebUI</h2>
<input type="file" id="fileInput" multiple>
<button id="uploadBtn" class="btn" onclick="uploadFiles('{{ session_id }}')">Upload and Process</button>
<div id="status" class="mt-3"></div>
<div id="processedList" class="mt-2"></div>
<div id="skippedList" class="mt-2"></div>
<a id="resultLink" class="btn mt-3" href="#" style="display:none;">Download Processed Files (ZIP)</a>
</div>
</body>
</html>