buecher/process.py
2024-08-24 19:12:44 +02:00

74 lines
2.6 KiB
Python

import subprocess
import tempfile
import os
import sys
import uuid
import re
# Passed to saveAsImage.js as the final command-line argument (after the HTML
# and the output path). Presumably a CSS selector for the element the script
# should capture -- confirm against saveAsImage.js.
CAPTURE_AREA = '#captureArea'
def main(input_file: str) -> None:
    """Convert an AsciiDoc document to DOCX via an intermediate HTML file.

    The pipeline is: ``asciidoctor`` renders the document to a temporary HTML
    file, ``replace_images`` swaps every ``<!--image-->...<!--/image-->``
    region for an ``<img>`` tag, and ``pandoc`` converts the rewritten HTML
    to ``<input basename>.docx`` in the same directory as the input file.

    Args:
        input_file: Path to the AsciiDoc source. A falsy value prints the
            usage message and exits with status 1.

    Raises:
        SystemExit: If ``input_file`` is falsy.
        subprocess.CalledProcessError: If ``asciidoctor`` or ``pandoc`` exits
            with a non-zero status (both are run with ``check=True``).
    """
    if not input_file:
        print("Usage: python3 script.py path/to/document.adoc")
        sys.exit(1)

    output_directory = os.path.dirname(input_file)
    output_basename = os.path.splitext(os.path.basename(input_file))[0]
    output_docx = os.path.join(output_directory, f"{output_basename}.docx")

    # Generated images are kept relative to the current working directory so
    # they outlive the temporary directory used for the intermediate HTML.
    images_directory = "./generated_images"
    os.makedirs(images_directory, exist_ok=True)

    with tempfile.TemporaryDirectory() as temp_dir:
        html_path = os.path.join(temp_dir, f"{output_basename}_temp.html")

        # Step 1: AsciiDoc -> HTML.
        subprocess.run(
            ['asciidoctor', '-b', 'html', '-o', html_path, input_file],
            check=True,
        )

        # Step 2: rewrite the HTML in place. The encoding is pinned to UTF-8
        # instead of the locale default so non-ASCII text survives the round
        # trip on every platform (pandoc also expects UTF-8 input).
        with open(html_path, 'r', encoding='utf-8') as file:
            content = file.read()

        modified_content = replace_images(content, images_directory)

        with open(html_path, 'w', encoding='utf-8') as file:
            file.write(modified_content)

        # Step 3: HTML -> DOCX. Must run while the temporary directory still
        # exists, because html_path lives inside it.
        subprocess.run(
            ['pandoc', '-f', 'html', '-t', 'docx', '-o', output_docx, html_path],
            check=True,
        )

    print(f"DOCX file created successfully: {output_docx}")
def replace_images(content: str, image_dir: str) -> str:
    """Replace each ``<!--image-->...<!--/image-->`` region with an ``<img>`` tag.

    For every marked region, the enclosed HTML is wrapped in a minimal
    document and handed to ``node saveAsImage.js`` together with a fresh
    UUID-named ``.png`` path inside ``image_dir`` and ``CAPTURE_AREA``.

    Args:
        content: HTML text that may contain marked regions.
        image_dir: Directory in which the generated PNG paths are placed.

    Returns:
        ``content`` with every marked region replaced by an ``<img>`` tag
        pointing at the generated file, or by an empty string when the Node
        script exits with a non-zero status. Text without markers is
        returned unchanged.
    """
    # Function-scope import so the block does not depend on a file-level one.
    from html import escape

    def replacement(match):
        html_content = match.group(1).strip()
        output_path = os.path.join(image_dir, f"{uuid.uuid4()}.png")

        # Wrap the fragment in a complete HTML document before rendering.
        full_html = f"<html><body>{html_content}</body></html>"

        command = ['node', 'saveAsImage.js', full_html, output_path, CAPTURE_AREA]
        result = subprocess.run(command, capture_output=True, text=True)

        if result.returncode != 0:
            # Diagnostics go to stderr so they do not mix with normal output.
            print("Failed to generate image:", result.stderr, file=sys.stderr)
            return ""  # Best effort: drop the region if rendering fails

        # Escape the path so quotes or ampersands in image_dir cannot break
        # out of the src attribute.
        src = escape(output_path, quote=True)
        return f'<img src="{src}" alt="Generated Image" style="width:75%;">'

    # re.DOTALL lets '.' span newlines so multi-line regions match; the lazy
    # '.*?' stops at the nearest closing marker so several regions in one
    # document are replaced independently.
    return re.sub(
        r'<!--image-->(.*?)<!--/image-->',
        replacement,
        content,
        flags=re.DOTALL,
    )
if __name__ == "__main__":
    # Pass an empty path when no argument was supplied so that main() prints
    # its usage message instead of this line raising IndexError.
    main(sys.argv[1] if len(sys.argv) > 1 else "")