# Source: llm-reasoning-framework/debug_reasoning.py at main · matdev83/llm-reasoning-framework · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#!/usr/bin/env python3
"""
Debug script to see what DeepSeek-R1 is actually outputting.

Sends a few prompts to the "deepseek/deepseek-r1-0528:free" model through
LLMClient and prints the raw responses. The API key is read from the
OPENROUTER_API_KEY environment variable.
"""

import sys
import os
import logging

# Add src to path. This has to happen before the two imports below, which
# presumably resolve to modules inside that 'src' directory.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

from llm_client import LLMClient
from llm_config import LLMConfig

# Set up logging: root logger at DEBUG level, records formatted as
# "LEVEL [logger name] message".
logging.basicConfig(level=logging.DEBUG, format='%(levelname)s [%(name)s] %(message)s')

# Model queried by every debug case below.
_DEEPSEEK_MODEL = "deepseek/deepseek-r1-0528:free"

# Marker the step-by-step prompt asks the model to emit when it is done.
_COMPLETION_TOKEN = "<REASONING_COMPLETE>"

# Prompt used by TEST 1 and TEST 2.
_STEP_BY_STEP_PROMPT = """Problem: What is 2+2?

Think step-by-step to solve this problem. When you finish your reasoning, output exactly: <REASONING_COMPLETE>

Reasoning:"""

# Prompt used by TEST 3: starts with an already-filled <THINKING> block.
_PREFILLED_THINKING_PROMPT = """<THINKING>
Let me solve this step by step:

Problem: What is 2+2?

I need to add 2 and 2 together.
2 + 2 = 4

The answer is 4.
</THINKING>

Looking at this problem, I need to add 2 and 2 together."""


def _print_call_result(output, stats):
    """Print the call statistics, then the output as repr() and as plain text."""
    rule = "=" * 50
    print("✅ SUCCESS!")
    print("📊 Stats:")
    print(f"   Model: {stats.model_name}")
    print(f"   Tokens: {stats.completion_tokens}")
    print(f"   Duration: {stats.call_duration_seconds:.2f}s")
    print("\n🔍 Raw Output:")
    print(rule)
    # repr() makes whitespace and escape sequences in the response visible.
    print(repr(output))
    print(rule)
    print(output)
    print(rule)


def _report_reasoning_markers(output):
    """Print whether the output contains thinking tags and the completion token."""
    print("\n🧠 Testing reasoning extraction...")

    # Treat a missing/empty response as empty text so the checks report
    # "not found" instead of failing on None.
    # NOTE(review): whether LLMClient.call can return None is not visible here.
    text = output or ""

    # Check if it contains DeepSeek-R1 thinking tags (case-insensitive)
    if "<THINKING>" in text.upper():
        print("✅ Found <THINKING> tags")
    elif "<think>" in text.lower():
        print("✅ Found <think> tags")
    else:
        print("❌ No DeepSeek thinking tags found")

    # Check if it contains the completion token (exact, case-sensitive match)
    if _COMPLETION_TOKEN in text:
        print("✅ Found <REASONING_COMPLETE> token")
    else:
        print("❌ No <REASONING_COMPLETE> token found")


def _run_debug_case(llm_client, title, prompt, *, max_tokens, stop, check_markers=False):
    """Run one model call and print its result; report any failure and return.

    Args:
        llm_client: Client object exposing ``call(prompt=, models=, config=)``.
        title: Heading line printed before the call.
        prompt: Prompt text sent to the model.
        max_tokens: Value passed to ``LLMConfig(max_tokens=...)``.
        stop: Value passed to ``LLMConfig(stop=...)`` (``None`` for no stop token).
        check_markers: When true, also report thinking tags / completion token.
    """
    print(title)
    try:
        config = LLMConfig(
            temperature=0.1,
            max_tokens=max_tokens,
            stop=stop,
        )
        output, stats = llm_client.call(
            prompt=prompt,
            models=[_DEEPSEEK_MODEL],
            config=config,
        )
        _print_call_result(output, stats)
        if check_markers:
            _report_reasoning_markers(output)
    except Exception as e:
        # Deliberately broad: this is a debug script, and one failing case
        # must not prevent the remaining cases from running.
        print(f"❌ Error: {e}")


def test_deepseek_output():
    """Test what DeepSeek-R1 actually outputs.

    Reads the API key from the OPENROUTER_API_KEY environment variable; if it
    is unset or empty, prints an error and returns without calling the model.
    Otherwise runs three cases against the same model and prints each result:

    1. the step-by-step prompt without a stop token (plus marker checks),
    2. the same prompt with ``<REASONING_COMPLETE>`` as the stop token,
    3. a prompt that begins with a pre-filled ``<THINKING>`` block.

    Returns:
        None. All results are written to stdout.
    """
    api_key = os.getenv('OPENROUTER_API_KEY')
    if not api_key:
        print("❌ Error: OPENROUTER_API_KEY environment variable not set")
        return

    llm_client = LLMClient(api_key=api_key)
    case_separator = "\n" + "="*60 + "\n"

    print("🔍 Testing DeepSeek-R1 output...")
    print("📝 Prompt:")
    print(_STEP_BY_STEP_PROMPT)
    print("\n" + "="*50 + "\n")

    # Test 1: Without stop token
    _run_debug_case(
        llm_client,
        "🧪 TEST 1: Without stop token",
        _STEP_BY_STEP_PROMPT,
        max_tokens=800,
        stop=None,
        check_markers=True,
    )

    print(case_separator)

    # Test 2: With stop token
    _run_debug_case(
        llm_client,
        "🧪 TEST 2: With stop token",
        _STEP_BY_STEP_PROMPT,
        max_tokens=800,
        stop=[_COMPLETION_TOKEN],
    )

    print(case_separator)

    # Test 3: Different prompt style for DeepSeek
    _run_debug_case(
        llm_client,
        "🧪 TEST 3: DeepSeek-specific prompt",
        _PREFILLED_THINKING_PROMPT,
        max_tokens=400,
        stop=None,
    )

# Run the debug cases only when executed as a script, not when imported.
if __name__ == "__main__":
    test_deepseek_output()