Module pdf2markdown
Expand source code
#!/usr/bin/env python3
# - Take PDF file input
# - Use pdftotext to extract text
# - Convert to basic Markdown (headings, paragraphs, bullet points)
# - Output to a .md file
import os
import sys
import argparse
import pdftotext
def extract_text_from_pdf(file_path):
    """Load the PDF at *file_path* and return it as a ``pdftotext.PDF`` object.

    The file is opened in binary mode and closed again before returning; the
    ``pdftotext.PDF`` object is constructed while the file is still open.
    Callers in this module iterate over the result one page at a time
    (presumably each page is a ``str`` -- confirm against the pdftotext docs).
    """
    with open(file_path, "rb") as handle:
        return pdftotext.PDF(handle)
def convert_to_markdown(pages, bullet='-', skip_lines=None, merge_lines=True, width=80):
    """Convert an iterable of page texts into one basic Markdown string.

    Each page is emitted under a ``## Page N`` heading (N starts at 1). Within
    a page every line is stripped of surrounding whitespace and then:

    * dropped if it contains any of the ``skip_lines`` substrings;
    * kept as a paragraph break if it is blank (consecutive blank lines
      collapse into one, leading blank lines on a page are dropped);
    * rewritten as a ``- item`` Markdown bullet if it starts with ``bullet``;
    * otherwise, when ``merge_lines`` is true and the line is shorter than
      ``width`` characters, appended to the previous entry unless that entry
      is blank or already ends with a full stop;
    * otherwise kept as its own line.

    Args:
        pages: Iterable of strings, one per page.
        bullet: Prefix that marks a bullet line in the input. An empty string
            disables bullet detection.
        skip_lines: Substrings; any line containing one is discarded. Empty
            substrings are ignored because they would match every line.
        merge_lines: Whether short lines are merged into paragraphs.
        width: Lines shorter than this many characters are merge candidates.

    Returns:
        The Markdown text; an empty string when ``pages`` is empty.
    """
    # An empty substring is contained in every string and would drop all lines.
    skip_lines = [s for s in (skip_lines or []) if s]
    md_output = []
    for i, page in enumerate(pages):
        buffer = []
        # Split on real newlines (the escaped literal never occurs in page text).
        for line in page.split('\n'):
            line = line.strip()
            if not line:
                # Blank line: paragraph break. Keep at most one in a row and
                # never start a page with one.
                if buffer and buffer[-1]:
                    buffer.append('')
                continue
            if any(skip in line for skip in skip_lines):
                continue
            # Convert bullets (guarded: startswith('') is always true).
            if bullet and line.startswith(bullet):
                buffer.append(f"- {line[len(bullet):].strip()}")
                continue
            # Merge short lines into the running paragraph, but never across a
            # paragraph break or past a sentence that already ended.
            if (merge_lines and len(line) < width
                    and buffer and buffer[-1] and not buffer[-1].endswith('.')):
                buffer[-1] += ' ' + line
            else:
                buffer.append(line)
        # Page heading precedes the page's content.
        md_output.append(f"\n\n## Page {i+1}\n")
        md_output.extend(buffer)
    return '\n'.join(md_output)
def main():
    """Command-line entry point: convert one PDF file to a Markdown file.

    Parses the command line, extracts the page text with
    ``extract_text_from_pdf``, converts it with ``convert_to_markdown`` and
    writes the result as UTF-8. When ``-o/--output`` is not given, the output
    path is the input path with its extension replaced by ``.md``. The output
    path is printed on success.
    """
    parser = argparse.ArgumentParser(description="Convert PDF to Markdown using pdftotext.")
    parser.add_argument("pdf_file", help="Path to the input PDF file")
    parser.add_argument("-o", "--output", help="Output Markdown file (default: same as input with .md)", default=None)
    parser.add_argument("--bullet", help="Bullet character to detect (default: '-')", default='-')
    parser.add_argument("--skip", help="Lines containing these substrings will be skipped (comma-separated)", default="")
    parser.add_argument("--width", type=int, help="Line width threshold to merge paragraphs (default: 80)", default=80)
    parser.add_argument("--no-merge", action="store_true", help="Do not merge lines into paragraphs")
    args = parser.parse_args()

    pdf_file = args.pdf_file
    output_file = args.output or os.path.splitext(pdf_file)[0] + ".md"
    # Drop empty entries (from "a,", "a,,b" or "a, ,b"): an empty substring is
    # contained in every line and would cause the whole document to be skipped.
    skip_lines = [part.strip() for part in args.skip.split(',') if part.strip()]

    pages = extract_text_from_pdf(pdf_file)
    markdown = convert_to_markdown(pages, bullet=args.bullet, skip_lines=skip_lines,
                                   merge_lines=not args.no_merge, width=args.width)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(markdown)
    print(f"Markdown saved to: {output_file}")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Functions
def convert_to_markdown(pages, bullet='-', skip_lines=None, merge_lines=True, width=80)
-
Expand source code
def convert_to_markdown(pages, bullet='-', skip_lines=None, merge_lines=True, width=80):
    """Convert an iterable of page texts into one basic Markdown string.

    Each page is emitted under a ``## Page N`` heading (N starts at 1). Within
    a page every line is stripped of surrounding whitespace and then:

    * dropped if it contains any of the ``skip_lines`` substrings;
    * kept as a paragraph break if it is blank (consecutive blank lines
      collapse into one, leading blank lines on a page are dropped);
    * rewritten as a ``- item`` Markdown bullet if it starts with ``bullet``;
    * otherwise, when ``merge_lines`` is true and the line is shorter than
      ``width`` characters, appended to the previous entry unless that entry
      is blank or already ends with a full stop;
    * otherwise kept as its own line.

    Args:
        pages: Iterable of strings, one per page.
        bullet: Prefix that marks a bullet line in the input. An empty string
            disables bullet detection.
        skip_lines: Substrings; any line containing one is discarded. Empty
            substrings are ignored because they would match every line.
        merge_lines: Whether short lines are merged into paragraphs.
        width: Lines shorter than this many characters are merge candidates.

    Returns:
        The Markdown text; an empty string when ``pages`` is empty.
    """
    # An empty substring is contained in every string and would drop all lines.
    skip_lines = [s for s in (skip_lines or []) if s]
    md_output = []
    for i, page in enumerate(pages):
        buffer = []
        # Split on real newlines (the escaped literal never occurs in page text).
        for line in page.split('\n'):
            line = line.strip()
            if not line:
                # Blank line: paragraph break. Keep at most one in a row and
                # never start a page with one.
                if buffer and buffer[-1]:
                    buffer.append('')
                continue
            if any(skip in line for skip in skip_lines):
                continue
            # Convert bullets (guarded: startswith('') is always true).
            if bullet and line.startswith(bullet):
                buffer.append(f"- {line[len(bullet):].strip()}")
                continue
            # Merge short lines into the running paragraph, but never across a
            # paragraph break or past a sentence that already ended.
            if (merge_lines and len(line) < width
                    and buffer and buffer[-1] and not buffer[-1].endswith('.')):
                buffer[-1] += ' ' + line
            else:
                buffer.append(line)
        # Page heading precedes the page's content.
        md_output.append(f"\n\n## Page {i+1}\n")
        md_output.extend(buffer)
    return '\n'.join(md_output)
def extract_text_from_pdf(file_path)
-
Expand source code
def extract_text_from_pdf(file_path):
    """Load the PDF at *file_path* and return it as a ``pdftotext.PDF`` object.

    The file is opened in binary mode and closed again before returning; the
    ``pdftotext.PDF`` object is constructed while the file is still open.
    """
    with open(file_path, "rb") as handle:
        return pdftotext.PDF(handle)
def main()
-
Expand source code
def main():
    """Command-line entry point: convert one PDF file to a Markdown file.

    Parses the command line, extracts the page text with
    ``extract_text_from_pdf``, converts it with ``convert_to_markdown`` and
    writes the result as UTF-8. When ``-o/--output`` is not given, the output
    path is the input path with its extension replaced by ``.md``. The output
    path is printed on success.
    """
    parser = argparse.ArgumentParser(description="Convert PDF to Markdown using pdftotext.")
    parser.add_argument("pdf_file", help="Path to the input PDF file")
    parser.add_argument("-o", "--output", help="Output Markdown file (default: same as input with .md)", default=None)
    parser.add_argument("--bullet", help="Bullet character to detect (default: '-')", default='-')
    parser.add_argument("--skip", help="Lines containing these substrings will be skipped (comma-separated)", default="")
    parser.add_argument("--width", type=int, help="Line width threshold to merge paragraphs (default: 80)", default=80)
    parser.add_argument("--no-merge", action="store_true", help="Do not merge lines into paragraphs")
    args = parser.parse_args()

    pdf_file = args.pdf_file
    output_file = args.output or os.path.splitext(pdf_file)[0] + ".md"
    # Drop empty entries (from "a,", "a,,b" or "a, ,b"): an empty substring is
    # contained in every line and would cause the whole document to be skipped.
    skip_lines = [part.strip() for part in args.skip.split(',') if part.strip()]

    pages = extract_text_from_pdf(pdf_file)
    markdown = convert_to_markdown(pages, bullet=args.bullet, skip_lines=skip_lines,
                                   merge_lines=not args.no_merge, width=args.width)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(markdown)
    print(f"Markdown saved to: {output_file}")