Module remove_comments
Script to remove comments and docstrings from a Python file while keeping shebang and UTF-8 encoding declarations.
Usage
python remove_comments.py filename.py [suffix] [destination_folder]
Parameters
filename.py: The Python file to process (mandatory).
suffix: Optional suffix for the output file (default: .nocomment).
destination_folder: Optional folder for saving the processed file (default: same as input file).
Expand source code
#!/usr/bin/env python3
"""
Script to remove comments and docstrings from a Python file while keeping shebang and UTF-8 encoding declarations.
Usage:
python remove_comments.py filename.py [suffix] [destination_folder]
Parameters:
filename.py: The Python file to process (mandatory).
suffix: Optional suffix for the output file (default: .nocomment).
destination_folder: Optional folder for saving the processed file (default: same as input file).
"""
import sys
import os
import ast
import tokenize
from io import StringIO
import re
# Function to check if a file is Python code
def is_python_file(file_path):
    """Report whether ``file_path`` names a Python source file.

    The decision is based purely on the extension returned by
    ``os.path.splitext``; the file itself is never opened.

    Args:
        file_path (str): Path or bare file name to inspect.

    Returns:
        bool: True when the extension is exactly ``.py`` (case-sensitive).
    """
    extension = os.path.splitext(file_path)[1]
    return extension == ".py"
def remove_docstrings(source):
    """
    Remove docstrings from the source code using the ast module.

    Docstrings are stripped from the module, from classes, and from both
    regular and ``async`` functions, at any nesting depth. When a class or
    function consisted of nothing but its docstring, a ``pass`` statement is
    inserted so the regenerated code is still valid Python.

    Because the code is regenerated from the AST, comments and the original
    formatting are not retained in the returned text.

    Args:
        source (str): The original Python source code.

    Returns:
        str: The source code without docstrings.

    Raises:
        SystemExit: If the source cannot be parsed, or if neither
            ``ast.unparse`` nor the ``astor`` package is available.
    """
    try:
        # Parse the source code into an AST
        parsed_ast = ast.parse(source)
    except SyntaxError as e:
        print(f"Syntax error while parsing the file: {e}")
        sys.exit(1)

    class DocstringRemover(ast.NodeTransformer):
        """Drop the leading docstring statement from documentable nodes."""

        def _strip_docstring(self, node):
            # Visit children first so nested definitions are cleaned as well.
            self.generic_visit(node)
            # clean=False with an explicit None test also catches empty or
            # whitespace-only docstrings, which a truthiness test would miss.
            if ast.get_docstring(node, clean=False) is not None:
                node.body = node.body[1:]
                # A def/class with an empty body would unparse to invalid
                # code; an empty module body is fine.
                if not node.body and not isinstance(node, ast.Module):
                    node.body = [ast.Pass()]
            return node

        visit_FunctionDef = _strip_docstring
        visit_AsyncFunctionDef = _strip_docstring
        visit_ClassDef = _strip_docstring
        visit_Module = _strip_docstring

    # Remove docstrings
    cleaned_ast = DocstringRemover().visit(parsed_ast)
    ast.fix_missing_locations(cleaned_ast)

    # Convert the AST back to source code
    try:
        cleaned_source = ast.unparse(cleaned_ast)
    except AttributeError:
        # For Python versions < 3.9 where ast.unparse is not available
        try:
            import astor
            cleaned_source = astor.to_source(cleaned_ast)
        except ImportError:
            print("For Python versions below 3.9, please install the 'astor' library: pip install astor")
            sys.exit(1)
    return cleaned_source
def remove_comments(source):
    """
    Remove comments from the source code using the tokenize module.

    The tokenizer is used only to locate comments, so ``#`` characters inside
    string literals are left alone. Each comment is then cut out of the
    original text, together with the whitespace that preceded it, so the
    layout of the remaining code is unchanged. A line that held only a
    comment becomes an empty line.

    Args:
        source (str): The Python source code.

    Returns:
        str: The source code without comments.

    Raises:
        SystemExit: If the tokenizer reports a ``tokenize.TokenError``.
    """
    comment_starts = []
    try:
        for tok in tokenize.generate_tokens(StringIO(source).readline):
            if tok.type == tokenize.COMMENT:
                comment_starts.append(tok.start)
    except tokenize.TokenError as e:
        print(f"Tokenization error: {e}")
        sys.exit(1)

    # Split exactly the way the tokenizer read the text so rows line up.
    lines = StringIO(source).readlines()
    for row, col in comment_starts:
        if row > len(lines):
            continue
        line = lines[row - 1]
        # Only cut where a comment really starts; if the reported position
        # does not land on '#', leave the line untouched rather than risk
        # deleting code.
        if line[col:col + 1] != "#":
            continue
        line_ending = line[len(line.rstrip("\r\n")):]
        # A comment always runs to the end of its line, so everything from
        # ``col`` up to the line ending can be dropped.
        lines[row - 1] = line[:col].rstrip() + line_ending
    return "".join(lines)
def preserve_preserved_lines(source):
    """
    Split the leading header lines off the front of the source.

    Starting from the first line, a line belongs to the header while it is a
    shebang (starts with ``#!``), a comment containing an encoding
    declaration, or blank. The first line that is none of these ends the
    header.

    Args:
        source (str): The original Python source code.

    Returns:
        tuple: (preserved_lines, remaining_source) where preserved_lines is a
        list of header lines with their line endings and remaining_source is
        the rest of the text as a single string.
    """
    coding_pattern = re.compile(r'coding[:=]\s*([-\w.]+)')

    def belongs_to_header(text):
        if text.startswith('#!'):
            return True
        if text.startswith('#') and coding_pattern.search(text):
            return True
        return text.strip() == ''

    all_lines = source.splitlines(keepends=True)
    # Index of the first line that is not part of the header, or the total
    # line count when every line qualifies.
    cut = next(
        (pos for pos, text in enumerate(all_lines) if not belongs_to_header(text)),
        len(all_lines),
    )
    return all_lines[:cut], ''.join(all_lines[cut:])
def minimize_blank_lines(source):
    """
    Collapse runs of blank lines and tidy trailing whitespace.

    The text is processed as plain text in three steps: runs of blank (or
    whitespace-only) lines are reduced to one blank line, trailing whitespace
    is removed from every line, and the result is stripped at both ends and
    terminated with exactly one newline.

    Args:
        source (str): The Python source code.

    Returns:
        str: The source code with minimized blank lines.
    """
    # Two newlines in a row are what leaves a single blank line behind.
    collapsed = re.sub(r'\n\s*\n+', '\n\n', source)
    trimmed_rows = [row.rstrip() for row in collapsed.splitlines()]
    body = '\n'.join(trimmed_rows).strip()
    return body + '\n'
def remove_comments_and_docstrings(source):
    """
    Strip docstrings and comments from the source while keeping its header.

    The leading shebang / encoding / blank lines are split off first and
    re-attached unchanged in front of the cleaned code. The rest of the text
    goes through docstring removal, comment removal and blank-line
    minimization, in that order.

    Args:
        source (str): The original Python source code.

    Returns:
        str: The cleaned source code.
    """
    header_lines, body = preserve_preserved_lines(source)
    body = remove_docstrings(body)
    body = remove_comments(body)
    body = minimize_blank_lines(body)
    return ''.join(header_lines) + body
# Main function
def main():
    """Command-line entry point.

    Reads ``filename.py [suffix] [destination_folder]`` from ``sys.argv``,
    cleans the input file and writes the result to
    ``<base><suffix><ext>``. A relative destination folder is resolved
    against the directory of the input file. Usage errors, a missing or
    non-Python input file, and any exception raised while processing all
    print a message and exit with status 1.
    """
    args = sys.argv[1:]
    if not args:
        print("Usage: python remove_comments.py filename.py [suffix] [destination_folder]")
        sys.exit(1)

    input_file = args[0]
    suffix = args[1] if len(args) > 1 else ".nocomment"
    destination_folder = args[2] if len(args) > 2 else None

    if not os.path.isfile(input_file):
        print(f"Error: File '{input_file}' not found.")
        sys.exit(1)
    if not is_python_file(input_file):
        print(f"Error: File '{input_file}' is not a Python file.")
        sys.exit(1)

    try:
        with open(input_file, "r", encoding="utf-8") as src:
            file_content = src.read()
        cleaned_content = remove_comments_and_docstrings(file_content)

        # Build "<base><suffix><ext>" from the input file name.
        stem, extension = os.path.splitext(os.path.basename(input_file))
        input_dir = os.path.dirname(input_file)

        if destination_folder:
            # A relative destination is taken relative to the input file.
            if os.path.isabs(destination_folder):
                target_dir = destination_folder
            else:
                target_dir = os.path.join(input_dir, destination_folder)
            os.makedirs(target_dir, exist_ok=True)
        else:
            target_dir = input_dir
        output_file = os.path.join(target_dir, stem + suffix + extension)

        with open(output_file, "w", encoding="utf-8") as dst:
            dst.write(cleaned_content)
        print(f"Processed file saved to: {output_file}")
    except Exception as e:
        print(f"Error processing file: {e}")
        sys.exit(1)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
Functions
def is_python_file(file_path)
-
Check if the given file is a Python file.
Expand source code
def is_python_file(file_path):
    """Report whether ``file_path`` names a Python source file.

    The decision is based purely on the extension returned by
    ``os.path.splitext``; the file itself is never opened.

    Args:
        file_path (str): Path or bare file name to inspect.

    Returns:
        bool: True when the extension is exactly ``.py`` (case-sensitive).
    """
    extension = os.path.splitext(file_path)[1]
    return extension == ".py"
def main()
-
Expand source code
def main():
    """Command-line entry point.

    Reads ``filename.py [suffix] [destination_folder]`` from ``sys.argv``,
    cleans the input file and writes the result to
    ``<base><suffix><ext>``. A relative destination folder is resolved
    against the directory of the input file. Usage errors, a missing or
    non-Python input file, and any exception raised while processing all
    print a message and exit with status 1.
    """
    args = sys.argv[1:]
    if not args:
        print("Usage: python remove_comments.py filename.py [suffix] [destination_folder]")
        sys.exit(1)

    input_file = args[0]
    suffix = args[1] if len(args) > 1 else ".nocomment"
    destination_folder = args[2] if len(args) > 2 else None

    if not os.path.isfile(input_file):
        print(f"Error: File '{input_file}' not found.")
        sys.exit(1)
    if not is_python_file(input_file):
        print(f"Error: File '{input_file}' is not a Python file.")
        sys.exit(1)

    try:
        with open(input_file, "r", encoding="utf-8") as src:
            file_content = src.read()
        cleaned_content = remove_comments_and_docstrings(file_content)

        # Build "<base><suffix><ext>" from the input file name.
        stem, extension = os.path.splitext(os.path.basename(input_file))
        input_dir = os.path.dirname(input_file)

        if destination_folder:
            # A relative destination is taken relative to the input file.
            if os.path.isabs(destination_folder):
                target_dir = destination_folder
            else:
                target_dir = os.path.join(input_dir, destination_folder)
            os.makedirs(target_dir, exist_ok=True)
        else:
            target_dir = input_dir
        output_file = os.path.join(target_dir, stem + suffix + extension)

        with open(output_file, "w", encoding="utf-8") as dst:
            dst.write(cleaned_content)
        print(f"Processed file saved to: {output_file}")
    except Exception as e:
        print(f"Error processing file: {e}")
        sys.exit(1)
def minimize_blank_lines(source)
-
Collapse multiple consecutive blank lines into a single blank line.
Args:
source (str): The Python source code.
Returns:
str: The source code with minimized blank lines.
Expand source code
def minimize_blank_lines(source):
    """
    Collapse runs of blank lines and tidy trailing whitespace.

    The text is processed as plain text in three steps: runs of blank (or
    whitespace-only) lines are reduced to one blank line, trailing whitespace
    is removed from every line, and the result is stripped at both ends and
    terminated with exactly one newline.

    Args:
        source (str): The Python source code.

    Returns:
        str: The source code with minimized blank lines.
    """
    # Two newlines in a row are what leaves a single blank line behind.
    collapsed = re.sub(r'\n\s*\n+', '\n\n', source)
    trimmed_rows = [row.rstrip() for row in collapsed.splitlines()]
    body = '\n'.join(trimmed_rows).strip()
    return body + '\n'
def preserve_preserved_lines(source)
-
Preserve shebang and encoding declarations.
Args:
source (str): The original Python source code.
Returns:
tuple: (preserved_lines, remaining_source)
Expand source code
def preserve_preserved_lines(source):
    """
    Split the leading header lines off the front of the source.

    Starting from the first line, a line belongs to the header while it is a
    shebang (starts with ``#!``), a comment containing an encoding
    declaration, or blank. The first line that is none of these ends the
    header.

    Args:
        source (str): The original Python source code.

    Returns:
        tuple: (preserved_lines, remaining_source) where preserved_lines is a
        list of header lines with their line endings and remaining_source is
        the rest of the text as a single string.
    """
    coding_pattern = re.compile(r'coding[:=]\s*([-\w.]+)')

    def belongs_to_header(text):
        if text.startswith('#!'):
            return True
        if text.startswith('#') and coding_pattern.search(text):
            return True
        return text.strip() == ''

    all_lines = source.splitlines(keepends=True)
    # Index of the first line that is not part of the header, or the total
    # line count when every line qualifies.
    cut = next(
        (pos for pos, text in enumerate(all_lines) if not belongs_to_header(text)),
        len(all_lines),
    )
    return all_lines[:cut], ''.join(all_lines[cut:])
def remove_comments(source)
-
Remove comments from the source code using the tokenize module.
Args:
source (str): The Python source code.
Returns:
str: The source code without comments.
Expand source code
def remove_comments(source):
    """
    Remove comments from the source code using the tokenize module.

    The tokenizer is used only to locate comments, so ``#`` characters inside
    string literals are left alone. Each comment is then cut out of the
    original text, together with the whitespace that preceded it, so the
    layout of the remaining code is unchanged. A line that held only a
    comment becomes an empty line.

    Args:
        source (str): The Python source code.

    Returns:
        str: The source code without comments.

    Raises:
        SystemExit: If the tokenizer reports a ``tokenize.TokenError``.
    """
    comment_starts = []
    try:
        for tok in tokenize.generate_tokens(StringIO(source).readline):
            if tok.type == tokenize.COMMENT:
                comment_starts.append(tok.start)
    except tokenize.TokenError as e:
        print(f"Tokenization error: {e}")
        sys.exit(1)

    # Split exactly the way the tokenizer read the text so rows line up.
    lines = StringIO(source).readlines()
    for row, col in comment_starts:
        if row > len(lines):
            continue
        line = lines[row - 1]
        # Only cut where a comment really starts; if the reported position
        # does not land on '#', leave the line untouched rather than risk
        # deleting code.
        if line[col:col + 1] != "#":
            continue
        line_ending = line[len(line.rstrip("\r\n")):]
        # A comment always runs to the end of its line, so everything from
        # ``col`` up to the line ending can be dropped.
        lines[row - 1] = line[:col].rstrip() + line_ending
    return "".join(lines)
def remove_comments_and_docstrings(source)
-
Remove both docstrings and comments from the source code.
Args:
source (str): The original Python source code.
Returns:
str: The cleaned source code.
Expand source code
def remove_comments_and_docstrings(source):
    """
    Strip docstrings and comments from the source while keeping its header.

    The leading shebang / encoding / blank lines are split off first and
    re-attached unchanged in front of the cleaned code. The rest of the text
    goes through docstring removal, comment removal and blank-line
    minimization, in that order.

    Args:
        source (str): The original Python source code.

    Returns:
        str: The cleaned source code.
    """
    header_lines, body = preserve_preserved_lines(source)
    body = remove_docstrings(body)
    body = remove_comments(body)
    body = minimize_blank_lines(body)
    return ''.join(header_lines) + body
def remove_docstrings(source)
-
Remove docstrings from the source code using the ast module.
Args:
source (str): The original Python source code.
Returns:
str: The source code without docstrings.
Expand source code
def remove_docstrings(source):
    """
    Remove docstrings from the source code using the ast module.

    Docstrings are stripped from the module, from classes, and from both
    regular and ``async`` functions, at any nesting depth. When a class or
    function consisted of nothing but its docstring, a ``pass`` statement is
    inserted so the regenerated code is still valid Python.

    Because the code is regenerated from the AST, comments and the original
    formatting are not retained in the returned text.

    Args:
        source (str): The original Python source code.

    Returns:
        str: The source code without docstrings.

    Raises:
        SystemExit: If the source cannot be parsed, or if neither
            ``ast.unparse`` nor the ``astor`` package is available.
    """
    try:
        # Parse the source code into an AST
        parsed_ast = ast.parse(source)
    except SyntaxError as e:
        print(f"Syntax error while parsing the file: {e}")
        sys.exit(1)

    class DocstringRemover(ast.NodeTransformer):
        """Drop the leading docstring statement from documentable nodes."""

        def _strip_docstring(self, node):
            # Visit children first so nested definitions are cleaned as well.
            self.generic_visit(node)
            # clean=False with an explicit None test also catches empty or
            # whitespace-only docstrings, which a truthiness test would miss.
            if ast.get_docstring(node, clean=False) is not None:
                node.body = node.body[1:]
                # A def/class with an empty body would unparse to invalid
                # code; an empty module body is fine.
                if not node.body and not isinstance(node, ast.Module):
                    node.body = [ast.Pass()]
            return node

        visit_FunctionDef = _strip_docstring
        visit_AsyncFunctionDef = _strip_docstring
        visit_ClassDef = _strip_docstring
        visit_Module = _strip_docstring

    # Remove docstrings
    cleaned_ast = DocstringRemover().visit(parsed_ast)
    ast.fix_missing_locations(cleaned_ast)

    # Convert the AST back to source code
    try:
        cleaned_source = ast.unparse(cleaned_ast)
    except AttributeError:
        # For Python versions < 3.9 where ast.unparse is not available
        try:
            import astor
            cleaned_source = astor.to_source(cleaned_ast)
        except ImportError:
            print("For Python versions below 3.9, please install the 'astor' library: pip install astor")
            sys.exit(1)
    return cleaned_source