# buecher/process.py

import subprocess
import tempfile
import os
import sys
import uuid
import re

# Passed as the last argument to saveAsImage.js for every rendered fragment.
# NOTE(review): presumably a CSS selector for the element the Node script
# screenshots; confirm against saveAsImage.js.
CAPTURE_AREA = '#captureArea'

def main(input_file):
    """Convert an AsciiDoc document to DOCX via an intermediate HTML file.

    The pipeline is: ``asciidoctor`` renders *input_file* to HTML inside a
    temporary directory, ``replace_images`` rewrites the
    ``<!--image-->...<!--/image-->`` spans of that HTML, and ``pandoc``
    converts the result to ``<basename>.docx`` next to the input file.

    Args:
        input_file: Path to the AsciiDoc source. When falsy (``None`` or
            ``""``) a usage message is printed and the process exits.

    Raises:
        SystemExit: With status 1 when *input_file* is falsy.
        subprocess.CalledProcessError: When ``asciidoctor`` or ``pandoc``
            exits with a non-zero status.
    """
    if not input_file:
        print("Usage: python3 script.py path/to/document.adoc")
        sys.exit(1)

    output_directory = os.path.dirname(input_file)
    output_basename = os.path.splitext(os.path.basename(input_file))[0]
    output_docx = os.path.join(output_directory, f"{output_basename}.docx")
    # Relative to the current working directory, not to the input file.
    images_directory = "./generated_images"

    # Ensure the images directory exists
    os.makedirs(images_directory, exist_ok=True)

    # The intermediate HTML only needs to live until pandoc has consumed it.
    with tempfile.TemporaryDirectory() as temp_dir:
        html_path = os.path.join(temp_dir, f"{output_basename}_temp.html")

        # Convert AsciiDoc to HTML first, handling images next
        subprocess.run(
            ['asciidoctor', '-b', 'html', '-o', html_path, input_file],
            check=True,
        )

        # Asciidoctor writes UTF-8 and pandoc reads UTF-8, so the encoding is
        # pinned here instead of relying on the platform's locale default
        # (which would corrupt non-ASCII text on e.g. cp1252 systems).
        with open(html_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Modify the content by replacing image placeholders with actual image tags
        modified_content = replace_images(content, images_directory)

        # Rewrite the modified HTML back to file
        with open(html_path, 'w', encoding='utf-8') as file:
            file.write(modified_content)

        # Convert the final HTML to DOCX
        subprocess.run(
            ['pandoc', '-f', 'html', '-t', 'docx', '-o', output_docx, html_path],
            check=True,
        )

        print(f"DOCX file created successfully: {output_docx}")


def replace_images(content, image_dir):
    """Replace each ``<!--image-->...<!--/image-->`` span with an ``<img>`` tag.

    For every span, the enclosed markup is stripped, wrapped in a minimal
    HTML document and handed to ``node saveAsImage.js`` together with a
    freshly generated ``<uuid>.png`` path inside *image_dir* and the
    module-level ``CAPTURE_AREA`` value.

    Args:
        content: HTML text to scan.
        image_dir: Directory used to build the output path of each PNG.

    Returns:
        *content* with every marked span replaced by an ``<img>`` tag that
        points at the generated file, or by an empty string when the Node
        process exits with a non-zero status.
    """
    def render_span(found):
        fragment = found.group(1).strip()
        png_path = os.path.join(image_dir, f"{uuid.uuid4()}.png")

        # The Node script receives a complete document, not a bare fragment.
        document = f"<html><body>{fragment}</body></html>"

        outcome = subprocess.run(
            ['node', 'saveAsImage.js', document, png_path, CAPTURE_AREA],
            capture_output=True,
            text=True,
        )

        if outcome.returncode == 0:
            return f'<img src="{png_path}" alt="Generated Image" style="width:75%;">'

        # Best effort: report the failure and drop the span from the output.
        print("Failed to generate image:", outcome.stderr)
        return ""

    # DOTALL lets a span cover several lines; the lazy '.*?' stops at the
    # nearest closing marker so adjacent spans are matched separately.
    span_pattern = re.compile(r'<!--image-->(.*?)<!--/image-->', re.DOTALL)
    return span_pattern.sub(render_span, content)


if __name__ == "__main__":
    # Pass None when no path was supplied so that main() prints its usage
    # message instead of the script dying with an IndexError on sys.argv[1].
    main(sys.argv[1] if len(sys.argv) > 1 else None)