monkeyoffice-api-docs/analyze_html.py at main · tkaufmann/monkeyoffice-api-docs · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/usr/bin/env python3
# ABOUTME: Analyzes HTML structure to identify API endpoints and sections
# ABOUTME: Extracts headings and creates structured endpoint mapping

import re
from bs4 import BeautifulSoup
import json

def analyze_html_structure(html_file):
    """Analyze HTML structure to identify endpoints and sections.

    Parses the file with BeautifulSoup's ``html.parser``, prints the
    document title (when a ``<title>`` element exists) and one summary line
    per non-empty heading, and collects a description of every non-empty
    ``h1``-``h6`` heading.

    Args:
        html_file: Path of the HTML file to read; it is opened as UTF-8 text.

    Returns:
        A list of dicts, one per non-empty heading, with the keys:

        * ``title``: the heading text with surrounding whitespace stripped
        * ``level``: the heading level as an int (the digit of the tag name)
        * ``filename``: a lowercase, dash-separated slug of the title with
          a ``.md`` suffix (``untitled.md`` when the title contains no
          word characters)
        * ``element_id``: the heading's ``id`` attribute, or ``''``
        * ``class``: the heading's ``class`` attribute, or ``[]``

    Raises:
        OSError: If ``html_file`` cannot be opened.
    """
    with open(html_file, 'r', encoding='utf-8') as f:
        html_content = f.read()

    soup = BeautifulSoup(html_content, 'html.parser')

    # Report the document title when the page declares one
    title = soup.find('title')
    if title:
        print(f"Title: {title.get_text()}")

    # Find all headings to understand structure
    headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

    # Compiled once, outside the loop: the patterns are loop-invariant
    unsafe_chars = re.compile(r'[^\w\s-]')
    separator_runs = re.compile(r'[-\s]+')

    endpoints = []

    for heading in headings:
        text = heading.get_text().strip()

        # Skip empty headings
        if not text:
            continue

        level = int(heading.name[1])

        # Create filename-safe version: drop everything except word
        # characters, whitespace and dashes, then collapse whitespace/dash
        # runs into single dashes.
        filename = unsafe_chars.sub('', text.lower())
        filename = separator_runs.sub('-', filename).strip('-')

        # A punctuation-only heading (e.g. "/") would otherwise produce the
        # extension-only filename ".md".
        if not filename:
            filename = 'untitled'

        # NOTE(review): the filename depends only on the heading text, so
        # headings with identical text map to the same filename.
        endpoints.append({
            'title': text,
            'level': level,
            'filename': f"{filename}.md",
            'element_id': heading.get('id', ''),
            'class': heading.get('class', []),
        })

        print(f"H{level}: {text} -> {filename}.md")

    return endpoints

if __name__ == "__main__":
    # Script entry point: analyze the HTML file 'moapi.html' from the
    # current working directory.
    endpoints = analyze_html_structure('moapi.html')

    # Write the collected heading records to 'html_structure.json' as
    # indented JSON, keeping non-ASCII characters unescaped.
    with open('html_structure.json', mode='w', encoding='utf-8') as out_file:
        json.dump(
            endpoints,
            out_file,
            ensure_ascii=False,
            indent=2,
        )

    # Final summary line, preceded by a blank line
    print(f"\nFound {len(endpoints)} headings/endpoints")