Skip to content

Commit 7e03df6

Browse files
authored
fix: AI field now correctly handles small text files (baserow#5082) (baserow#5150)
* fix: AI field now correctly handles small text files (baserow#5082)

  Small uploadable files (<= 10 KB) are inlined as TextContent instead of being uploaded via the Files API, fixing an issue where models failed to associate very small uploaded files with the prompt. Non-UTF-8 files fall back to BinaryContent. Also fixes visible_name resolution and notifies the model about skipped files. Bumps pydantic-ai-slim to 1.77.0 for TextContent support.

* address copilot feedback
1 parent adcfdac commit 7e03df6

7 files changed

Lines changed: 238 additions & 41 deletions

File tree

backend/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ dependencies = [
9595
"genson==1.3.0",
9696
"pyotp==2.9.0",
9797
"qrcode==8.2",
98-
"pydantic-ai-slim[anthropic,bedrock,google,groq,mistral,openai]==1.66.0",
98+
"pydantic-ai-slim[anthropic,bedrock,google,groq,mistral,openai]==1.77.0",
9999
"opentelemetry-sdk>=1.20.0",
100100
"netifaces==0.11.0",
101101
"requests-futures>=1.0.2",

backend/src/baserow/core/generative_ai/generative_ai_model_types.py

Lines changed: 74 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55

66
from django.conf import settings
77

8+
from loguru import logger
9+
810
from baserow.core.models import Workspace
911

1012
from .registries import GenerativeAIModelType
@@ -124,23 +126,76 @@ def get_base_url(
124126
".xlsx",
125127
".xls",
126128
}
127-
_MAX_EMBED_PAYLOAD_BYTES = 45 * 1024 * 1024 # 50 MB minus some headroom
129+
# https://developers.openai.com/api/docs/guides/file-inputs
130+
_MAX_EMBED_PAYLOAD_BYTES = 45 * 1024 * 1024 # 50 MB minus headroom
128131
_MAX_EMBEDS_PER_REQUEST = 500
132+
# Below this limit, uploadable files are sent inline.
133+
_INLINE_UPLOAD_THRESHOLD_BYTES = 10 * 1024 # 10 KB
129134

130135
def _get_max_upload_bytes(self) -> int:
131136
return (
132137
min(512, settings.BASEROW_OPENAI_UPLOADED_FILE_SIZE_LIMIT_MB) * 1024 * 1024
133138
)
134139

140+
def _can_embed(self, file_size: int, embed_count: int, embed_payload: int) -> bool:
141+
return (
142+
embed_count < self._MAX_EMBEDS_PER_REQUEST
143+
and embed_payload + file_size <= self._MAX_EMBED_PAYLOAD_BYTES
144+
)
145+
146+
@staticmethod
147+
def _embed(ai_file: "AIFile", data: bytes) -> None:
148+
from pydantic_ai import BinaryContent
149+
150+
ai_file.content = BinaryContent(
151+
data=data,
152+
media_type=ai_file.mime_type,
153+
identifier=ai_file.original_name,
154+
)
155+
156+
@staticmethod
157+
def _inline_text(ai_file: "AIFile", data: bytes) -> bool:
158+
"""Try to inline file content as TextContent. Returns False if the
159+
content is not valid UTF-8."""
160+
161+
from pydantic_ai import TextContent
162+
163+
try:
164+
text = data.decode("utf-8")
165+
except (UnicodeDecodeError, ValueError):
166+
return False
167+
ai_file.content = TextContent(
168+
content=(
169+
f"[Content of file '{ai_file.original_name}']\n{text}\n[End of file]"
170+
),
171+
metadata={"source": ai_file.original_name},
172+
)
173+
return True
174+
175+
def _upload(
176+
self,
177+
ai_file: "AIFile",
178+
data: bytes,
179+
workspace: Optional[Workspace] = None,
180+
settings_override: Optional[dict[str, Any]] = None,
181+
) -> None:
182+
from pydantic_ai import UploadedFile
183+
184+
file_id = self._upload_file(ai_file.name, data, workspace, settings_override)
185+
ai_file.provider_file_id = file_id
186+
ai_file.content = UploadedFile(
187+
file_id=file_id,
188+
provider_name="openai",
189+
media_type=ai_file.mime_type,
190+
identifier=ai_file.original_name,
191+
)
192+
135193
def prepare_files(
136194
self,
137195
files: list[AIFile],
138196
workspace: Optional[Workspace] = None,
139197
settings_override: Optional[dict[str, Any]] = None,
140198
) -> list[AIFile]:
141-
from loguru import logger
142-
from pydantic_ai import BinaryContent, UploadedFile
143-
144199
embed_payload = 0
145200
embed_count = 0
146201
max_upload = self._get_max_upload_bytes()
@@ -151,38 +206,29 @@ def prepare_files(
151206

152207
try:
153208
if ext in self._EMBEDDABLE_EXTENSIONS:
154-
if (
155-
embed_count >= self._MAX_EMBEDS_PER_REQUEST
156-
or embed_payload + ai_file.size > self._MAX_EMBED_PAYLOAD_BYTES
157-
):
209+
if not self._can_embed(ai_file.size, embed_count, embed_payload):
158210
continue
159-
data = ai_file.read_content()
160-
ai_file.content = BinaryContent(
161-
data=data,
162-
media_type=ai_file.mime_type,
163-
identifier=ai_file.original_name,
164-
)
211+
self._embed(ai_file, ai_file.read_content())
165212
embed_payload += ai_file.size
166213
embed_count += 1
167214

168215
elif ext in self._UPLOADABLE_EXTENSIONS:
169216
if ai_file.size > max_upload:
170217
continue
171218
data = ai_file.read_content()
172-
file_id = self._upload_file(
173-
ai_file.name, data, workspace, settings_override
174-
)
175-
ai_file.provider_file_id = file_id
176-
ai_file.content = UploadedFile(
177-
file_id=file_id,
178-
provider_name="openai",
179-
media_type=ai_file.mime_type,
180-
identifier=ai_file.original_name,
181-
)
182-
except Exception:
183-
logger.warning(
184-
f"Skipping file {ai_file.name}: failed to read or upload."
185-
)
219+
220+
if (
221+
ai_file.size <= self._INLINE_UPLOAD_THRESHOLD_BYTES
222+
and self._can_embed(ai_file.size, embed_count, embed_payload)
223+
):
224+
if not self._inline_text(ai_file, data):
225+
self._embed(ai_file, data)
226+
embed_payload += ai_file.size
227+
embed_count += 1
228+
else:
229+
self._upload(ai_file, data, workspace, settings_override)
230+
except Exception as exc:
231+
logger.warning(f"Skipping file {ai_file.name}: {exc}")
186232
continue
187233

188234
return [f for f in files if f.content is not None]

backend/src/baserow/core/generative_ai/registries.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ def _build_user_prompt(
187187
if content:
188188
prompt = (
189189
f"{prompt}\n\n"
190-
"The contents of the attached files are included below. "
190+
"The following file contents are provided for context. "
191191
"Use them to answer the prompt above."
192192
)
193193
return [prompt] + content

backend/tests/baserow/core/generative_ai/test_generative_ai_model_types.py

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,24 @@
1+
from unittest.mock import patch
2+
3+
from pydantic_ai import BinaryContent, TextContent, UploadedFile
4+
15
from baserow.core.generative_ai.generative_ai_model_types import (
26
OpenAIGenerativeAIModelType,
37
)
8+
from baserow_premium.fields.ai_file import AIFile
9+
10+
11+
def _make_ai_file(
12+
name: str, size: int, mime_type: str = "text/plain", content_bytes: bytes = b""
13+
) -> AIFile:
14+
ai_file = AIFile(
15+
name=name,
16+
original_name=name,
17+
size=size,
18+
mime_type=mime_type,
19+
)
20+
ai_file.read_content = lambda: content_bytes # type: ignore[assignment]
21+
return ai_file
422

523

624
def test_openai_supports_files():
@@ -34,3 +52,118 @@ def test_openai_max_upload_size(settings):
3452

3553
settings.BASEROW_OPENAI_UPLOADED_FILE_SIZE_LIMIT_MB = 100
3654
assert ai_model_type._get_max_upload_bytes() == 100 * 1024 * 1024
55+
56+
57+
def test_prepare_files_small_text_file_is_inlined():
58+
"""A small .txt file should be inlined as TextContent, not uploaded."""
59+
60+
ai_model_type = OpenAIGenerativeAIModelType()
61+
data = b"talk about hamburger"
62+
ai_file = _make_ai_file("a.txt", size=len(data), content_bytes=data)
63+
64+
result = ai_model_type.prepare_files([ai_file])
65+
66+
assert len(result) == 1
67+
assert isinstance(result[0].content, TextContent)
68+
assert "talk about hamburger" in result[0].content.content
69+
assert "a.txt" in result[0].content.content
70+
assert result[0].provider_file_id is None
71+
72+
73+
def test_prepare_files_small_binary_uploadable_is_embedded():
74+
"""A small non-UTF-8 uploadable file falls back to BinaryContent."""
75+
76+
ai_model_type = OpenAIGenerativeAIModelType()
77+
data = b"\x80\x81\x82"
78+
ai_file = _make_ai_file(
79+
"data.csv", size=len(data), mime_type="text/csv", content_bytes=data
80+
)
81+
82+
result = ai_model_type.prepare_files([ai_file])
83+
84+
assert len(result) == 1
85+
assert isinstance(result[0].content, BinaryContent)
86+
assert result[0].provider_file_id is None
87+
88+
89+
def test_prepare_files_large_uploadable_is_uploaded():
90+
"""A .txt file over the inline threshold should be uploaded via the Files API."""
91+
92+
ai_model_type = OpenAIGenerativeAIModelType()
93+
size = ai_model_type._INLINE_UPLOAD_THRESHOLD_BYTES + 1
94+
data = b"x" * size
95+
ai_file = _make_ai_file("big.txt", size=size, content_bytes=data)
96+
97+
with patch.object(ai_model_type, "_upload_file", return_value="file-123"):
98+
result = ai_model_type.prepare_files([ai_file])
99+
100+
assert len(result) == 1
101+
assert isinstance(result[0].content, UploadedFile)
102+
assert result[0].provider_file_id == "file-123"
103+
104+
105+
def test_prepare_files_small_uploadable_respects_embed_limits():
106+
"""When embed payload would exceed the limit, small files fall back to upload."""
107+
108+
ai_model_type = OpenAIGenerativeAIModelType()
109+
data = b"small"
110+
ai_file = _make_ai_file("a.txt", size=len(data), content_bytes=data)
111+
112+
# Pretend we already used up the embed budget by setting the limit to 0.
113+
original = ai_model_type._MAX_EMBED_PAYLOAD_BYTES
114+
ai_model_type._MAX_EMBED_PAYLOAD_BYTES = 0
115+
try:
116+
with patch.object(ai_model_type, "_upload_file", return_value="file-456"):
117+
result = ai_model_type.prepare_files([ai_file])
118+
finally:
119+
ai_model_type._MAX_EMBED_PAYLOAD_BYTES = original
120+
121+
assert len(result) == 1
122+
assert isinstance(result[0].content, UploadedFile)
123+
assert result[0].provider_file_id == "file-456"
124+
125+
126+
def test_prepare_files_image_still_embedded():
127+
"""Images should still go through the embeddable path as before."""
128+
129+
ai_model_type = OpenAIGenerativeAIModelType()
130+
data = b"\x89PNG\r\n\x1a\n"
131+
ai_file = _make_ai_file(
132+
"photo.png", size=len(data), mime_type="image/png", content_bytes=data
133+
)
134+
135+
result = ai_model_type.prepare_files([ai_file])
136+
137+
assert len(result) == 1
138+
assert isinstance(result[0].content, BinaryContent)
139+
assert result[0].provider_file_id is None
140+
141+
142+
def test_prepare_files_unsupported_extension_is_skipped():
143+
"""Files with unsupported extensions are excluded from the result."""
144+
145+
ai_model_type = OpenAIGenerativeAIModelType()
146+
data = b"some data"
147+
ai_file = _make_ai_file(
148+
"video.mp4", size=len(data), mime_type="video/mp4", content_bytes=data
149+
)
150+
151+
result = ai_model_type.prepare_files([ai_file])
152+
153+
assert len(result) == 0
154+
assert ai_file.content is None
155+
156+
157+
def test_prepare_files_oversized_uploadable_is_skipped(settings):
158+
"""Uploadable files exceeding the size limit are excluded."""
159+
160+
ai_model_type = OpenAIGenerativeAIModelType()
161+
settings.BASEROW_OPENAI_UPLOADED_FILE_SIZE_LIMIT_MB = 1
162+
limit = ai_model_type._get_max_upload_bytes()
163+
data = b"x" * (limit + 1)
164+
ai_file = _make_ai_file("huge.txt", size=len(data), content_bytes=data)
165+
166+
result = ai_model_type.prepare_files([ai_file])
167+
168+
assert len(result) == 0
169+
assert ai_file.content is None

backend/uv.lock

Lines changed: 10 additions & 10 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
{
2+
"type": "bug",
3+
"message": "Fix AI field not correctly handling small text files",
4+
"issue_origin": "github",
5+
"issue_number": 5082,
6+
"domain": "database",
7+
"bullet_points": [],
8+
"created_at": "2026-04-08"
9+
}

0 commit comments

Comments
 (0)