You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and dots ('.'), can be up to 35 characters long. Letters must be lowercase.
610 lines
25 KiB
610 lines
25 KiB
#!/usr/bin/env python3 |
|
""" |
|
Python Documentation Generator using Ollama LLM |
|
Automatically generates comprehensive markdown documentation for Python projects. |
|
""" |
|
|
|
import os |
|
import ast |
|
import json |
|
import argparse |
|
import subprocess |
|
import sys |
|
from pathlib import Path |
|
from typing import Dict, List, Set, Tuple, Optional |
|
import requests |
|
import re |
|
from urllib.parse import quote |
|
|
|
class PythonAnalyzer:
    """Analyzes Python files to extract structural information.

    State is reset on every call to :meth:`analyze_file`, so one instance
    can be reused across many files.
    """

    def __init__(self):
        # Per-file state; repopulated by analyze_file().
        self.imports: Set[str] = set()
        self.classes: List[Dict] = []
        self.functions: List[Dict] = []
        self.constants: List[Dict] = []
        self.module_docstring: Optional[str] = None

    def analyze_file(self, file_path: str) -> Optional[Dict]:
        """Analyze a Python file and extract its structure.

        Args:
            file_path: Path to the ``.py`` file to read and parse.

        Returns:
            A dict with keys ``file_path``, ``content``, ``module_docstring``,
            ``imports``, ``classes``, ``functions``, ``constants`` and
            ``lines_of_code``; ``None`` if the file cannot be read or parsed
            (the error is printed, not raised, so a bad file does not abort a
            whole project scan).
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            tree = ast.parse(content)

            # Reset for each file
            self.imports = set()
            self.classes = []
            self.functions = []
            self.constants = []
            self.module_docstring = ast.get_docstring(tree)

            # Precompute which function nodes sit directly inside a class body
            # so the walk below can distinguish methods from plain functions in
            # O(1). The original re-walked the entire tree for every single
            # FunctionDef, which was accidentally quadratic.
            method_nodes = {
                id(item)
                for cls_node in ast.walk(tree)
                if isinstance(cls_node, ast.ClassDef)
                for item in cls_node.body
                if isinstance(item, ast.FunctionDef)
            }

            for node in ast.walk(tree):
                if isinstance(node, ast.Import):
                    for alias in node.names:
                        self.imports.add(alias.name)
                elif isinstance(node, ast.ImportFrom):
                    module = node.module or ""
                    for alias in node.names:
                        self.imports.add(f"{module}.{alias.name}")
                elif isinstance(node, ast.ClassDef):
                    self.classes.append({
                        'name': node.name,
                        'bases': [ast.unparse(base) for base in node.bases],
                        'docstring': ast.get_docstring(node),
                        'methods': [n.name for n in node.body if isinstance(n, ast.FunctionDef)],
                        'lineno': node.lineno
                    })
                elif isinstance(node, ast.FunctionDef):
                    # Record only functions that are not methods of a class.
                    if id(node) not in method_nodes:
                        self.functions.append({
                            'name': node.name,
                            'args': [arg.arg for arg in node.args.args],
                            'docstring': ast.get_docstring(node),
                            'lineno': node.lineno,
                            'returns': ast.unparse(node.returns) if node.returns else None
                        })
                elif isinstance(node, ast.Assign):
                    # ALL_CAPS assignment targets are treated as constants.
                    for target in node.targets:
                        if isinstance(target, ast.Name) and target.id.isupper():
                            self.constants.append({
                                'name': target.id,
                                'value': ast.unparse(node.value),
                                'lineno': node.lineno
                            })

            return {
                'file_path': file_path,
                'content': content,
                'module_docstring': self.module_docstring,
                'imports': list(self.imports),
                'classes': self.classes,
                'functions': self.functions,
                'constants': self.constants,
                'lines_of_code': len(content.splitlines())
            }

        except Exception as e:
            # Broad catch is deliberate: a single unreadable/unparsable file
            # must not abort the whole project scan.
            print(f"Error analyzing {file_path}: {e}")
            return None
|
|
|
class OllamaDocGenerator:
    """Generates documentation by prompting an Ollama LLM over HTTP.

    ``thinking=True`` routes requests through the chat endpoint and parses
    out the model's embedded reasoning (e.g. ``<think>...</think>`` blocks);
    otherwise the plain generate endpoint is used.
    """

    def __init__(self, model_name: str = "deepseek-r1:latest", ollama_url: str = "http://localhost:11434", thinking: bool = False):
        self.model_name = model_name
        self.ollama_url = ollama_url
        # One pooled HTTP session for all calls to the server.
        self.session = requests.Session()
        self.thinking = thinking

    def check_ollama_connection(self) -> bool:
        """Return True if the Ollama server answers on /api/tags."""
        try:
            # Bounded timeout so a dead server fails fast instead of hanging
            # forever (requests has no default timeout).
            response = self.session.get(f"{self.ollama_url}/api/tags", timeout=10)
            return response.status_code == 200
        except requests.exceptions.RequestException:
            return False

    def check_model_availability(self) -> bool:
        """Return True if the configured model is installed on the server."""
        try:
            response = self.session.get(f"{self.ollama_url}/api/tags", timeout=10)
            if response.status_code == 200:
                models = response.json().get('models', [])
                # startswith() tolerates tag suffixes such as ':latest'.
                return any(model['name'].startswith(self.model_name) for model in models)
            return False
        except requests.exceptions.RequestException:
            return False

    def generate_documentation(self, file_analysis: Dict, project_context: Dict) -> Optional[str]:
        """Generate markdown documentation for a single analyzed file.

        Args:
            file_analysis: Output of ``PythonAnalyzer.analyze_file``.
            project_context: Output of ``ProjectAnalyzer.scan_project``.

        Returns:
            The generated markdown text, or ``None`` on any HTTP or
            connection error (the error is printed).
        """
        # Create comprehensive prompt with context
        prompt = self.create_documentation_prompt(file_analysis, project_context)

        try:
            if self.thinking:
                print("Thinking model chosen")
                # Thinking models go through the chat endpoint; the reply may
                # embed a reasoning section that we strip out below.
                response = self.session.post(
                    f"{self.ollama_url}/api/chat",
                    json={
                        "model": self.model_name,
                        "messages": [
                            {
                                "role": "user",
                                "content": prompt
                            }
                        ],
                        "stream": False,
                        "options": {
                            # Low temperature for consistent, factual docs.
                            "temperature": 0.1,
                            "top_p": 0.9,
                        }
                    },
                    timeout=600  # 10 minute timeout for thinking models
                )

                if response.status_code == 200:
                    result = response.json()
                    message = result.get('message', {})
                    content = message.get('content', '')
                    # Separate the model's reasoning from the final answer.
                    thinking_content, final_answer = self.parse_thinking_response(content)

                    if thinking_content:
                        print(" 🧠 Model thinking process:")
                        print(f" {thinking_content[:200]}..." if len(thinking_content) > 200 else f" {thinking_content}")

                    return final_answer if final_answer else content
                else:
                    print(f"Error generating documentation: {response.status_code}")
                    return None
            else:
                print("Non-thinking model chosen")
                # Standard generation for regular models.
                response = self.session.post(
                    f"{self.ollama_url}/api/generate",
                    json={
                        "model": self.model_name,
                        "prompt": prompt,
                        "stream": False,
                        "think": False,
                        "options": {
                            "temperature": 0.1,
                            "top_p": 0.9,
                        }
                    },
                    timeout=300  # 5 minute timeout
                )

                if response.status_code == 200:
                    return response.json()['response']
                else:
                    print(f"Error generating documentation: {response.status_code}")
                    return None

        except requests.exceptions.RequestException as e:
            print(f"Error communicating with Ollama: {e}")
            return None

    def parse_thinking_response(self, content: str) -> Tuple[Optional[str], str]:
        """Split a thinking-model reply into (reasoning, final answer).

        Returns ``(None, content)`` unchanged when no reasoning section is
        recognized. Uses the module-level ``re`` import.
        """
        # Known explicit reasoning-tag formats, tried in order.
        thinking_patterns = [
            r'<thinking>(.*?)</thinking>',
            r'<think>(.*?)</think>',
            r'<reasoning>(.*?)</reasoning>',
            r'<analysis>(.*?)</analysis>'
        ]

        thinking_content = None
        final_answer = content

        for pattern in thinking_patterns:
            match = re.search(pattern, content, re.DOTALL)
            if match:
                thinking_content = match.group(1).strip()
                # Remove the reasoning section from the final answer.
                final_answer = re.sub(pattern, '', content, flags=re.DOTALL).strip()
                break

        # Fall back to prose-style openers ("Let me think about ...") when no
        # explicit tags were found.
        if not thinking_content:
            thinking_indicators = [
                r'^(Let me think about.*?(?=\n\n|\n#|\nI\'ll))',
                r'^(I need to analyze.*?(?=\n\n|\n#|\nI\'ll))',
                r'^(First, let me understand.*?(?=\n\n|\n#|\nI\'ll))',
                r'^(To document this.*?(?=\n\n|\n#|\nI\'ll))'
            ]

            for pattern in thinking_indicators:
                match = re.search(pattern, content, re.DOTALL | re.MULTILINE)
                if match:
                    thinking_content = match.group(1).strip()
                    final_answer = content[match.end():].strip()
                    break

        return thinking_content, final_answer

    def create_documentation_prompt(self, file_analysis: Dict, project_context: Dict) -> str:
        """Build the full LLM prompt: project context, file analysis, source, task."""
        file_path = file_analysis['file_path']
        relative_path = os.path.relpath(file_path, project_context['root_path'])

        prompt = f"""You are a technical documentation expert. Generate comprehensive markdown documentation for the Python file: `{relative_path}`

## PROJECT CONTEXT:
- **Project Root**: {project_context['root_path']}
- **Total Python Files**: {len(project_context['all_files'])}
- **External Dependencies**: {', '.join(project_context['external_dependencies']) if project_context['external_dependencies'] else 'None detected'}
- **Project Structure**:
{self.format_project_structure(project_context['file_structure'])}

## FILE ANALYSIS:
- **File Path**: `{relative_path}`
- **Lines of Code**: {file_analysis['lines_of_code']}
- **Module Docstring**: {file_analysis['module_docstring'] or 'None'}

### Imports ({len(file_analysis['imports'])} total):
{chr(10).join(f'- `{imp}`' for imp in file_analysis['imports'])}

### Classes ({len(file_analysis['classes'])} total):
{self.format_classes(file_analysis['classes'])}

### Functions ({len(file_analysis['functions'])} total):
{self.format_functions(file_analysis['functions'])}

### Constants ({len(file_analysis['constants'])} total):
{self.format_constants(file_analysis['constants'])}

## RELATED FILES:
{self.format_related_files(file_analysis, project_context)}

## FULL SOURCE CODE:
```python
{file_analysis['content']}
```

## DOCUMENTATION REQUIREMENTS:

Generate a complete markdown documentation file that includes:

1. **File Header**: Title ('Documentation ' + file), purpose, and brief description
2. **Overview**: What this module/file does and its role in the project
3. **Dependencies**: External and internal dependencies with explanations
4. **API Reference**: Detailed documentation of all classes, functions, and constants
5. **Usage Examples**: Practical code examples where applicable
6. **Cross-References**: Links to related files using relative markdown links
7. **Implementation Notes**: Architecture decisions, patterns used, etc.

## FORMATTING GUIDELINES:
- You are **NOT ALLOWED** TO USE markdown CODE BLOCKS!
- Use proper markdown syntax, so no **# title** or other non-standard markdown features
- Be careful with indentation
- Limit the use of unnecessary newlines
- Include code blocks with syntax highlighting
- Add tables for parameter/return value documentation
- Use relative links to other documentation files: `[filename](./filename.md)`
- Include line number references where helpful
- Make it professional and comprehensive
- Focus on clarity and usefulness for developers

Generate the complete markdown documentation now:"""

        return prompt

    def format_project_structure(self, file_structure: List) -> str:
        """Format the os.walk()-style structure as an indented bullet list."""
        if not file_structure:
            # Guard: the original indexed file_structure[0] unconditionally,
            # which raised IndexError for an empty scan.
            return ""
        root_prefix = file_structure[0][0]
        lines = []
        for root, dirs, files in file_structure:
            level = root.replace(root_prefix, '').count(os.sep)
            indent = ' ' * level
            lines.append(f"{indent}- {os.path.basename(root)}/")
            subindent = ' ' * (level + 1)
            for file in files:
                if file.endswith('.py'):
                    lines.append(f"{subindent}- {file}")
        return '\n'.join(lines[:20])  # Limit to first 20 lines

    def format_classes(self, classes: List[Dict]) -> str:
        """Format class information for the prompt."""
        if not classes:
            return "None"

        lines = []
        for cls in classes:
            lines.append(f"- **{cls['name']}** (line {cls['lineno']})")
            if cls['bases']:
                lines.append(f" - Inherits from: {', '.join(cls['bases'])}")
            if cls['methods']:
                lines.append(f" - Methods: {', '.join(cls['methods'])}")
            if cls['docstring']:
                # Truncate long docstrings to keep the prompt compact.
                lines.append(f" - Description: {cls['docstring'][:100]}...")
        return '\n'.join(lines)

    def format_functions(self, functions: List[Dict]) -> str:
        """Format function information for the prompt."""
        if not functions:
            return "None"

        lines = []
        for func in functions:
            args_str = ', '.join(func['args']) if func['args'] else ''
            lines.append(f"- **{func['name']}({args_str})** (line {func['lineno']})")
            if func['returns']:
                lines.append(f" - Returns: {func['returns']}")
            if func['docstring']:
                lines.append(f" - Description: {func['docstring'][:100]}...")
        return '\n'.join(lines)

    def format_constants(self, constants: List[Dict]) -> str:
        """Format constant information for the prompt."""
        if not constants:
            return "None"

        lines = []
        for const in constants:
            lines.append(f"- **{const['name']}** = {const['value']} (line {const['lineno']})")
        return '\n'.join(lines)

    def format_related_files(self, file_analysis: Dict, project_context: Dict) -> str:
        """List project files whose module path matches one of this file's imports."""
        current_imports = set(file_analysis['imports'])
        related_files = []

        for other_file in project_context['all_files']:
            if other_file != file_analysis['file_path']:
                rel_path = os.path.relpath(other_file, project_context['root_path'])
                module_name = rel_path.replace('/', '.').replace('\\', '.').replace('.py', '')

                # Does this file import that module (or a submodule of it)?
                if any(imp.startswith(module_name) for imp in current_imports):
                    related_files.append(f"- `{rel_path}` (imported by this file)")

        return '\n'.join(related_files) if related_files else "None detected"
|
|
|
class ProjectAnalyzer:
    """Analyzes the entire project structure.

    Walks a directory tree, collects Python files, and classifies imports
    as local (inside the project) or external.
    """

    # Directories that are always skipped, regardless of user excludes.
    DEFAULT_EXCLUDES = ('.git', '__pycache__', '.pytest_cache', 'venv', 'env', '.venv', 'node_modules')

    def __init__(self, root_path: str):
        self.root_path = Path(root_path).resolve()
        self.python_files: List[str] = []
        self.external_dependencies: Set[str] = set()

    def scan_project(self, exclude_dirs: Optional[List[str]] = None) -> Dict:
        """Scan the project and collect all Python files.

        Args:
            exclude_dirs: Extra directory names to skip, merged with
                ``DEFAULT_EXCLUDES``.

        Returns:
            A dict with keys ``root_path``, ``all_files``, ``file_structure``
            (list of ``os.walk`` tuples) and ``external_dependencies``.
        """
        excluded = set(self.DEFAULT_EXCLUDES)
        if exclude_dirs:
            excluded.update(exclude_dirs)

        self.python_files = []
        file_structure = []

        for root, dirs, files in os.walk(self.root_path):
            # Prune excluded directories in place so os.walk skips them.
            # (The original also filtered *files* against directory names,
            # which was a category mistake and is dropped here.)
            dirs[:] = [d for d in dirs if d not in excluded]
            file_structure.append((root, dirs, files))

            for file in files:
                if file.endswith('.py'):
                    self.python_files.append(os.path.join(root, file))

        # Analyze dependencies
        self.analyze_dependencies()

        return {
            'root_path': str(self.root_path),
            'all_files': self.python_files,
            'file_structure': file_structure,
            'external_dependencies': list(self.external_dependencies)
        }

    def analyze_dependencies(self):
        """Collect top-level names of external (non-local) imports."""
        analyzer = PythonAnalyzer()
        # Reset so repeated scans don't accumulate stale entries.
        self.external_dependencies = set()

        for file_path in self.python_files:
            analysis = analyzer.analyze_file(file_path)
            if analysis:
                for imp in analysis['imports']:
                    # Only record imports that do not resolve to project files.
                    if not self.is_local_import(imp):
                        self.external_dependencies.add(imp.split('.')[0])

    def is_local_import(self, import_name: str) -> bool:
        """Check whether an import refers to a module inside this project."""
        # Relative imports are local by definition.
        if import_name.startswith('.'):
            return True

        # Check against the module path of every collected Python file.
        for py_file in self.python_files:
            rel_path = os.path.relpath(py_file, self.root_path)
            module_path = rel_path.replace('/', '.').replace('\\', '.')
            # Strip only the trailing extension; a bare .replace('.py', '')
            # would also corrupt names containing '.py' mid-string.
            if module_path.endswith('.py'):
                module_path = module_path[:-3]
            # Exact module match or a dotted-submodule prefix. A plain
            # startswith() wrongly matched e.g. 'utilities' against 'util'.
            if import_name == module_path or import_name.startswith(module_path + '.'):
                return True

        return False
|
|
|
class DocumentationManager:
    """Manages writing generated documentation files and the index."""

    def __init__(self, output_dir: str = "./pydocs"):
        self.output_dir = Path(output_dir)
        # parents=True so nested output paths (e.g. docs/api) also work.
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def generate_index(self, project_context: Dict, generated_docs: List[str]):
        """Generate an index.md file linking to all documentation files."""
        index_content = f"""# Project Documentation

Auto-generated documentation for Python project: `{os.path.basename(project_context['root_path'])}`

## Project Overview

- **Total Python Files**: {len(project_context['all_files'])}
- **External Dependencies**: {len(project_context['external_dependencies'])}
- **Documentation Files**: {len(generated_docs)}

## External Dependencies

{chr(10).join(f'- `{dep}`' for dep in sorted(project_context['external_dependencies']))}

## File Documentation

"""

        for doc_file in sorted(generated_docs):
            # Swap only the trailing extension back to .py; str.replace would
            # also mangle any '.md' occurring earlier in the name.
            source_path = doc_file[:-3] + '.py' if doc_file.endswith('.md') else doc_file
            rel_path = os.path.relpath(source_path, '.')
            doc_name = os.path.basename(doc_file)
            index_content += f"- [`{rel_path}`](./{doc_name})\n"

        index_content += f"""
## Project Structure

```
{self.generate_tree_structure(project_context)}
```

---

*Documentation generated automatically using Ollama LLM*
"""

        with open(self.output_dir / "index.md", 'w', encoding='utf-8') as f:
            f.write(index_content)

    def generate_tree_structure(self, project_context: Dict, max_depth: int = 3) -> str:
        """Generate a tree-like listing of the project's Python files."""
        lines = []
        root_path = project_context['root_path']

        for py_file in sorted(project_context['all_files']):
            rel_path = os.path.relpath(py_file, root_path)
            depth = rel_path.count(os.sep)
            if depth <= max_depth:
                indent = "  " * depth
                # Show the actual file name; the original computed it and then
                # appended the literal placeholder "(unknown)" instead.
                lines.append(f"{indent}{os.path.basename(rel_path)}")

        return '\n'.join(lines[:50])  # Limit output

    def sanitize_filename(self, file_path: str, root_path: str) -> str:
        """Convert a source path into the matching relative .md path."""
        rel_path = os.path.relpath(file_path, root_path)
        safe_name = rel_path.replace('\\', '/')
        # Replace only the trailing extension; .replace('.py', '.md') would
        # also corrupt names like 'my.python.helpers.py'.
        if safe_name.endswith('.py'):
            safe_name = safe_name[:-3] + '.md'
        return safe_name
|
|
|
def main():
    """CLI entry point: scan a project, document each file, write an index.

    Exits with status 1 on a missing path, unreachable Ollama server,
    missing model, or an empty project.
    """
    parser = argparse.ArgumentParser(description="Generate documentation for Python project using Ollama")
    parser.add_argument("path", help="Path to Python project directory")
    parser.add_argument("--model", default="deepseek-r1:latest", help="Ollama model to use (default: deepseek-r1:latest). For thinking models use 'thinking' in name")
    # BooleanOptionalAction already yields a real bool and adds a
    # --no-thinking counterpart; combining it with type=bool is deprecated
    # in Python 3.12 and an error in newer releases, so we pass an explicit
    # default instead (the old code also left the default as None).
    parser.add_argument("--thinking", action=argparse.BooleanOptionalAction, default=False, help="Does the model think")
    parser.add_argument("--output", default="./pydocs", help="Output directory for documentation (default: ./pydocs)")
    parser.add_argument("--ollama-url", default="http://localhost:11434", help="Ollama server URL")
    parser.add_argument("--exclude", nargs="*", default=[], help="Directories to exclude from scanning")
    parser.add_argument("--max-files", type=int, default=400, help="Maximum number of files to process")

    args = parser.parse_args()

    # Validate project path
    if not os.path.exists(args.path):
        print(f"Error: Path '{args.path}' does not exist")
        sys.exit(1)

    # Initialize components
    doc_generator = OllamaDocGenerator(args.model, args.ollama_url, args.thinking)
    project_analyzer = ProjectAnalyzer(args.path)
    doc_manager = DocumentationManager(args.output)
    analyzer = PythonAnalyzer()

    # Check Ollama connection
    print("Checking Ollama connection...")
    if not doc_generator.check_ollama_connection():
        print(f"Error: Cannot connect to Ollama at {args.ollama_url}")
        print("Make sure Ollama is running: ollama serve")
        sys.exit(1)

    # Check model availability
    print(f"Checking model availability: {args.model}")
    if not doc_generator.check_model_availability():
        print(f"Error: Model '{args.model}' is not available")
        print(f"Install it with: ollama pull {args.model}")
        sys.exit(1)

    print(f"✓ Ollama connection established with model: {args.model}")

    # Scan project
    print("Scanning project...")
    project_context = project_analyzer.scan_project(args.exclude)

    if not project_context['all_files']:
        print("No Python files found in the project")
        sys.exit(1)

    print(f"Found {len(project_context['all_files'])} Python files")

    # Limit files if specified
    files_to_process = project_context['all_files'][:args.max_files]
    if len(files_to_process) < len(project_context['all_files']):
        print(f"Processing first {args.max_files} files (use --max-files to change)")

    # Generate documentation for each file
    generated_docs = []

    for i, file_path in enumerate(files_to_process, 1):
        rel_path = os.path.relpath(file_path, args.path)
        print(f"[{i}/{len(files_to_process)}] Documenting {rel_path}...")

        # Analyze file
        file_analysis = analyzer.analyze_file(file_path)
        if not file_analysis:
            print(" ⚠ Skipped due to analysis error")
            continue

        # Skip the LLM round-trip entirely for files with no code.
        has_code = bool(file_analysis['content'].strip(" \n\t"))
        documentation = doc_generator.generate_documentation(file_analysis, project_context) if has_code else ""
        if not documentation:
            print(" ⚠ Failed to generate documentation" if has_code else " ⚠ No document generated because no code was found in the file")
            continue

        # Save documentation, mirroring the source tree under the output dir.
        doc_filename = doc_manager.sanitize_filename(file_path, args.path)
        doc_path = doc_manager.output_dir / doc_filename
        os.makedirs(os.path.dirname(doc_path), exist_ok=True)
        with open(doc_path, 'w', encoding='utf-8') as f:
            f.write(documentation)

        generated_docs.append(doc_filename)
        print(f" ✓ Generated: {doc_filename}")

    # Generate index file
    if generated_docs:
        print("Generating index file...")
        doc_manager.generate_index(project_context, generated_docs)
        print(f"✓ Documentation complete! Check {args.output}/index.md")
        print(f"Generated {len(generated_docs)} documentation files")
    else:
        print("No documentation files were generated")
|
|
|
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()