Module pdf2markdown

Expand source code
#!/usr/bin/env python3
# - Take PDF file input
# - Use pdftotext to extract text
# - Convert to basic Markdown (headings, paragraphs, bullet points)
# - Output to a .md file

import os
import sys
import argparse
import pdftotext

def extract_text_from_pdf(file_path):
    """Open the PDF at *file_path* and return the parsed ``pdftotext.PDF`` object.

    The file is opened in binary mode and closed again before returning.
    Elsewhere in this module the result is handed to ``convert_to_markdown``,
    which iterates over it page by page.
    """
    with open(file_path, "rb") as pdf_stream:
        # The PDF object is built while the stream is still open.
        return pdftotext.PDF(pdf_stream)

def convert_to_markdown(pages, bullet='-', skip_lines=None, merge_lines=True, width=80):
    """Convert extracted page texts into a basic Markdown document.

    Args:
        pages: Iterable of strings, one per page. Each page is split into
            lines on newline characters.
        bullet: Prefix that marks a bullet line in the input. Matching lines
            are rewritten as Markdown ``- item`` entries. An empty string
            disables bullet detection.
        skip_lines: Optional list of substrings; any line containing one of
            them is dropped. Empty substrings are ignored.
        merge_lines: When true, a non-empty line shorter than ``width`` is
            appended (space-separated) to the previous entry of the page,
            unless that entry is blank or ends with a period.
        width: Length threshold used by the merge rule; lines of at least this
            length are always kept as their own entry.

    Returns:
        A single string in which every page is introduced by a
        ``## Page N`` heading and all entries are separated by newlines.
        An empty ``pages`` iterable yields an empty string.
    """
    # An empty pattern is a substring of every line and would drop everything.
    skip_lines = [pattern for pattern in (skip_lines or []) if pattern]
    md_output = []

    for i, page in enumerate(pages):
        buffer = []

        # Split on a real newline (the escaped form '\\n' never matched).
        for line in page.split('\n'):
            line = line.strip()
            if any(skip in line for skip in skip_lines):
                continue

            # Convert bullets; guard against an empty prefix, which would
            # otherwise match every line.
            if bullet and line.startswith(bullet):
                buffer.append(f"- {line[len(bullet):].strip()}")
                continue

            # Merge short lines into the running paragraph. Blank lines act as
            # paragraph breaks: they are never merged into the previous entry,
            # and the line after a blank always starts a new entry.
            can_merge = (
                merge_lines
                and line
                and len(line) < width
                and buffer
                and buffer[-1]
                and not buffer[-1].endswith('.')
            )
            if can_merge:
                buffer[-1] += ' ' + line
            else:
                buffer.append(line)

        # Every page gets its own heading, followed by its entries.
        md_output.append(f"\n\n## Page {i+1}\n")
        md_output.extend(buffer)

    return '\n'.join(md_output)

def main():
    """Command-line entry point: convert a PDF file to a Markdown file.

    Parses the command-line arguments, extracts the PDF's text with
    ``extract_text_from_pdf``, converts it with ``convert_to_markdown`` and
    writes the result as UTF-8. When no output path is given, the input
    path with its extension replaced by ``.md`` is used. The output path is
    printed when done.
    """
    parser = argparse.ArgumentParser(description="Convert PDF to Markdown using pdftotext.")
    parser.add_argument("pdf_file", help="Path to the input PDF file")
    parser.add_argument("-o", "--output", help="Output Markdown file (default: same as input with .md)", default=None)
    parser.add_argument("--bullet", help="Bullet character to detect (default: '-')", default='-')
    parser.add_argument("--skip", help="Lines containing these substrings will be skipped (comma-separated)", default="")
    parser.add_argument("--width", type=int, help="Line width threshold to merge paragraphs (default: 80)", default=80)
    parser.add_argument("--no-merge", action="store_true", help="Do not merge lines into paragraphs")

    args = parser.parse_args()
    pdf_file = args.pdf_file
    output_file = args.output or os.path.splitext(pdf_file)[0] + ".md"
    # Drop empty entries (e.g. from "a,,b" or a trailing comma): an empty
    # substring is contained in every line and would skip the whole document.
    skip_lines = [s.strip() for s in args.skip.split(',') if s.strip()]

    pages = extract_text_from_pdf(pdf_file)
    markdown = convert_to_markdown(pages, bullet=args.bullet, skip_lines=skip_lines,
                                   merge_lines=not args.no_merge, width=args.width)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(markdown)

    print(f"Markdown saved to: {output_file}")

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Functions

def convert_to_markdown(pages, bullet='-', skip_lines=None, merge_lines=True, width=80)
Expand source code
def convert_to_markdown(pages, bullet='-', skip_lines=None, merge_lines=True, width=80):
    """Convert extracted page texts into a basic Markdown document.

    Args:
        pages: Iterable of strings, one per page. Each page is split into
            lines on newline characters.
        bullet: Prefix that marks a bullet line in the input. Matching lines
            are rewritten as Markdown ``- item`` entries. An empty string
            disables bullet detection.
        skip_lines: Optional list of substrings; any line containing one of
            them is dropped. Empty substrings are ignored.
        merge_lines: When true, a non-empty line shorter than ``width`` is
            appended (space-separated) to the previous entry of the page,
            unless that entry is blank or ends with a period.
        width: Length threshold used by the merge rule; lines of at least this
            length are always kept as their own entry.

    Returns:
        A single string in which every page is introduced by a
        ``## Page N`` heading and all entries are separated by newlines.
        An empty ``pages`` iterable yields an empty string.
    """
    # An empty pattern is a substring of every line and would drop everything.
    skip_lines = [pattern for pattern in (skip_lines or []) if pattern]
    md_output = []

    for i, page in enumerate(pages):
        buffer = []

        # Split on a real newline (the escaped form '\\n' never matched).
        for line in page.split('\n'):
            line = line.strip()
            if any(skip in line for skip in skip_lines):
                continue

            # Convert bullets; guard against an empty prefix, which would
            # otherwise match every line.
            if bullet and line.startswith(bullet):
                buffer.append(f"- {line[len(bullet):].strip()}")
                continue

            # Merge short lines into the running paragraph. Blank lines act as
            # paragraph breaks: they are never merged into the previous entry,
            # and the line after a blank always starts a new entry.
            can_merge = (
                merge_lines
                and line
                and len(line) < width
                and buffer
                and buffer[-1]
                and not buffer[-1].endswith('.')
            )
            if can_merge:
                buffer[-1] += ' ' + line
            else:
                buffer.append(line)

        # Every page gets its own heading, followed by its entries.
        md_output.append(f"\n\n## Page {i+1}\n")
        md_output.extend(buffer)

    return '\n'.join(md_output)
def extract_text_from_pdf(file_path)
Expand source code
def extract_text_from_pdf(file_path):
    """Open the PDF at *file_path* and return the parsed ``pdftotext.PDF`` object.

    The file is opened in binary mode and closed again before returning.
    Elsewhere in this module the result is handed to ``convert_to_markdown``,
    which iterates over it page by page.
    """
    with open(file_path, "rb") as pdf_stream:
        # The PDF object is built while the stream is still open.
        return pdftotext.PDF(pdf_stream)
def main()
Expand source code
def main():
    """Command-line entry point: convert a PDF file to a Markdown file.

    Parses the command-line arguments, extracts the PDF's text with
    ``extract_text_from_pdf``, converts it with ``convert_to_markdown`` and
    writes the result as UTF-8. When no output path is given, the input
    path with its extension replaced by ``.md`` is used. The output path is
    printed when done.
    """
    parser = argparse.ArgumentParser(description="Convert PDF to Markdown using pdftotext.")
    parser.add_argument("pdf_file", help="Path to the input PDF file")
    parser.add_argument("-o", "--output", help="Output Markdown file (default: same as input with .md)", default=None)
    parser.add_argument("--bullet", help="Bullet character to detect (default: '-')", default='-')
    parser.add_argument("--skip", help="Lines containing these substrings will be skipped (comma-separated)", default="")
    parser.add_argument("--width", type=int, help="Line width threshold to merge paragraphs (default: 80)", default=80)
    parser.add_argument("--no-merge", action="store_true", help="Do not merge lines into paragraphs")

    args = parser.parse_args()
    pdf_file = args.pdf_file
    output_file = args.output or os.path.splitext(pdf_file)[0] + ".md"
    # Drop empty entries (e.g. from "a,,b" or a trailing comma): an empty
    # substring is contained in every line and would skip the whole document.
    skip_lines = [s.strip() for s in args.skip.split(',') if s.strip()]

    pages = extract_text_from_pdf(pdf_file)
    markdown = convert_to_markdown(pages, bullet=args.bullet, skip_lines=skip_lines,
                                   merge_lines=not args.no_merge, width=args.width)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(markdown)

    print(f"Markdown saved to: {output_file}")