Skip to content

Commit 05fd7bb

Browse files
authored
chore: dynamically import backend library to reduce memory footprint and startup time (baserow#4315)
* lazy imports * Fix tests * Add warning checks for libraries that should be lazy imported * Address feedback
1 parent 22f9c3c commit 05fd7bb

File tree

36 files changed

+435
-163
lines changed

36 files changed

+435
-163
lines changed

backend/src/baserow/config/asgi.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44

55
from channels.routing import ProtocolTypeRouter, URLRouter
66

7-
from baserow.config.helpers import ConcurrencyLimiterASGI
8-
from baserow.core.mcp import baserow_mcp
7+
from baserow.config.helpers import ConcurrencyLimiterASGI, check_lazy_loaded_libraries
8+
from baserow.core.mcp import get_baserow_mcp_server
99
from baserow.core.telemetry.telemetry import setup_logging, setup_telemetry
1010
from baserow.ws.routers import websocket_router
1111

@@ -18,13 +18,16 @@
1818
# logging setup. Otherwise Django will try to destroy any log handlers we added prior.
1919
setup_logging()
2020

21+
# Check that libraries meant to be lazy-loaded haven't been imported at startup.
22+
# This runs after Django is fully loaded, so it catches imports from all apps.
23+
check_lazy_loaded_libraries()
2124

2225
application = ProtocolTypeRouter(
2326
{
2427
"http": ConcurrencyLimiterASGI(
2528
URLRouter(
2629
[
27-
re_path(r"^mcp", baserow_mcp.sse_app()),
30+
re_path(r"^mcp", get_baserow_mcp_server().sse_app()),
2831
re_path(r"", django_asgi_app),
2932
]
3033
),

backend/src/baserow/config/celery.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
1+
from django.conf import settings
2+
13
from celery import Celery, signals
24

5+
from baserow.config.helpers import check_lazy_loaded_libraries
36
from baserow.core.telemetry.tasks import BaserowTelemetryTask
47

58
app = Celery("baserow")
@@ -26,3 +29,13 @@ def clear_local(*args, **kwargs):
2629

2730
signals.task_prerun.connect(clear_local)
2831
signals.task_postrun.connect(clear_local)
32+
33+
34+
@signals.worker_process_init.connect
def on_worker_init(**kwargs):
    """
    Run the lazy-loaded library check when a Celery worker process initializes.

    "mcp" is added to the list of checked libraries first, because it is only
    needed in asgi.py and should therefore not have been imported here.

    :param kwargs: Keyword arguments passed along by the signal; unused.
    """

    # Use the same "missing setting means nothing to check" default as
    # check_lazy_loaded_libraries, and only append once so repeated calls don't
    # keep growing the shared settings list.
    lazy_libs = getattr(settings, "BASEROW_LAZY_LOADED_LIBRARIES", None)
    if lazy_libs is not None and "mcp" not in lazy_libs:
        lazy_libs.append("mcp")

    # Check that libraries meant to be lazy-loaded haven't been imported at startup.
    # This runs after Django is fully loaded, so it catches imports from all apps.
    check_lazy_loaded_libraries()

backend/src/baserow/config/helpers.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,54 @@
11
import asyncio
2+
import sys
3+
4+
from django.conf import settings
25

36
from loguru import logger
47

58

9+
def check_lazy_loaded_libraries():
10+
"""
11+
Check if any libraries that should be lazy-loaded have been imported at startup.
12+
13+
This function checks sys.modules against settings.BASEROW_LAZY_LOADED_LIBRARIES
14+
and emits a warning if any of them have been loaded prematurely. This helps
15+
catch accidental top-level imports that defeat the purpose of lazy loading
16+
these heavy libraries to reduce memory footprint.
17+
18+
Only runs when DEBUG is True.
19+
"""
20+
21+
if not settings.DEBUG:
22+
return
23+
24+
lazy_libs = getattr(settings, "BASEROW_LAZY_LOADED_LIBRARIES", [])
25+
loaded_early = []
26+
27+
for lib in lazy_libs:
28+
if lib in sys.modules:
29+
loaded_early.append(lib)
30+
31+
if loaded_early:
32+
libs_list = ", ".join(f'"{lib}"' for lib in loaded_early)
33+
logger.warning(
34+
f"The following libraries were loaded during startup but should be "
35+
f"lazy-loaded to reduce memory footprint: {', '.join(loaded_early)}. "
36+
f"Either import them inside functions/methods where they're used, or "
37+
f"remove them from BASEROW_LAZY_LOADED_LIBRARIES if they're legitimately "
38+
f"needed at startup. "
39+
f"To debug, add the following code at the very top of your settings file "
40+
f"(e.g., settings/dev.py, before any other imports):\n\n"
41+
f"import sys, traceback\n"
42+
f"class _T:\n"
43+
f" def find_module(self, n, p=None):\n"
44+
f" for lib in [{libs_list}]:\n"
45+
f" if n == lib or n.startswith(lib + '.'):\n"
46+
f" print(f'IMPORT: {{n}}'); traceback.print_stack(); sys.exit(1)\n"
47+
f" return None\n"
48+
f"sys.meta_path.insert(0, _T())\n"
49+
)
50+
51+
652
class dummy_context:
753
async def __aenter__(self):
854
pass

backend/src/baserow/config/settings/base.py

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,7 @@
1111
from django.core.exceptions import ImproperlyConfigured
1212

1313
import dj_database_url
14-
import sentry_sdk
1514
from corsheaders.defaults import default_headers
16-
from sentry_sdk.integrations.django import DjangoIntegration
17-
from sentry_sdk.scrubber import DEFAULT_DENYLIST, EventScrubber
1815

1916
from baserow.config.settings.utils import (
2017
Setting,
@@ -1303,18 +1300,42 @@ def __setitem__(self, key, value):
13031300
print(e)
13041301

13051302

1303+
# Libraries that should be lazy-loaded (imported inside functions/methods) to reduce
1304+
# memory footprint at startup. If any of these are found in sys.modules during startup,
1305+
# a warning will be shown suggesting to either lazy-load them or remove them from this
1306+
# list if they're legitimately needed at startup.
1307+
BASEROW_LAZY_LOADED_LIBRARIES = [
1308+
"openai",
1309+
"anthropic",
1310+
"mistralai",
1311+
"ollama",
1312+
"langchain_core",
1313+
"jira2markdown",
1314+
"saml2",
1315+
"openpyxl",
1316+
"numpy",
1317+
]
1318+
1319+
13061320
SENTRY_BACKEND_DSN = os.getenv("SENTRY_BACKEND_DSN")
13071321
SENTRY_DSN = SENTRY_BACKEND_DSN or os.getenv("SENTRY_DSN")
1308-
SENTRY_DENYLIST = DEFAULT_DENYLIST + ["username", "email", "name"]
13091322

13101323
if SENTRY_DSN:
1324+
import sentry_sdk
1325+
from sentry_sdk.integrations.django import DjangoIntegration
1326+
from sentry_sdk.scrubber import DEFAULT_DENYLIST, EventScrubber
1327+
1328+
SENTRY_DENYLIST = DEFAULT_DENYLIST + ["username", "email", "name"]
1329+
13111330
sentry_sdk.init(
13121331
dsn=SENTRY_DSN,
13131332
integrations=[DjangoIntegration(signals_spans=False, middleware_spans=False)],
13141333
send_default_pii=False,
13151334
event_scrubber=EventScrubber(recursive=True, denylist=SENTRY_DENYLIST),
13161335
environment=os.getenv("SENTRY_ENVIRONMENT", ""),
13171336
)
1337+
else:
1338+
BASEROW_LAZY_LOADED_LIBRARIES.append("sentry_sdk")
13181339

13191340
BASEROW_OPENAI_API_KEY = os.getenv("BASEROW_OPENAI_API_KEY", None)
13201341
BASEROW_OPENAI_ORGANIZATION = os.getenv("BASEROW_OPENAI_ORGANIZATION", "") or None

backend/src/baserow/config/settings/dev.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,11 @@
2020
INSTALLED_APPS.insert(0, "daphne") # noqa: F405
2121
INSTALLED_APPS += ["django_extensions"] # noqa: F405
2222

23+
# daphne imports numpy via autobahn -> flatbuffers, so we exclude it from the
24+
# lazy-load check in dev mode. In production, numpy should still be lazy-loaded.
25+
if "numpy" in BASEROW_LAZY_LOADED_LIBRARIES: # noqa: F405
26+
BASEROW_LAZY_LOADED_LIBRARIES.remove("numpy") # noqa: F405
27+
2328
BASEROW_ENABLE_SILK = str_to_bool(os.getenv("BASEROW_ENABLE_SILK", "on"))
2429
if BASEROW_ENABLE_SILK:
2530
INSTALLED_APPS += ["silk"] # noqa: F405

backend/src/baserow/config/wsgi.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,10 @@
77
https://docs.djangoproject.com/en/2.2/howto/deployment/wsgi/
88
"""
99

10+
from django.conf import settings
1011
from django.core.wsgi import get_wsgi_application
1112

13+
from baserow.config.helpers import check_lazy_loaded_libraries
1214
from baserow.core.telemetry.telemetry import setup_logging, setup_telemetry
1315

1416
# The telemetry instrumentation library setup needs to run prior to django's setup.
@@ -19,3 +21,10 @@
1921
# It is critical to setup our own logging after django has been setup and done its own
2022
# logging setup. Otherwise Django will try to destroy any log handlers we added prior.
2123
setup_logging()
24+
25+
# The "mcp" library is only needed in asgi.py, so it must stay lazy-loaded under WSGI.
26+
settings.BASEROW_LAZY_LOADED_LIBRARIES.append("mcp")
27+
28+
# Check that libraries meant to be lazy-loaded haven't been imported at startup.
29+
# This runs after Django is fully loaded, so it catches imports from all apps.
30+
check_lazy_loaded_libraries()

backend/src/baserow/contrib/database/mcp/rows/tools.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
from asgiref.sync import sync_to_async
2-
from mcp import Tool
3-
from mcp.types import TextContent
42
from rest_framework.response import Response
53
from starlette.status import HTTP_204_NO_CONTENT
64

@@ -23,6 +21,8 @@ class ListRowsMcpTool(MCPTool):
2321
name = "list_table_rows"
2422

2523
async def list(self, endpoint):
24+
from mcp import Tool
25+
2626
return [
2727
Tool(
2828
name=self.name,
@@ -64,6 +64,8 @@ async def call(
6464
name_parameters,
6565
call_arguments,
6666
):
67+
from mcp.types import TextContent
68+
6769
table_id = call_arguments["table_id"]
6870
if not await sync_to_async(table_in_workspace_of_endpoint)(endpoint, table_id):
6971
return [TextContent(type="text", text="Table not in endpoint workspace.")]
@@ -92,6 +94,8 @@ class CreateRowMcpTool(MCPTool):
9294
name = "create_row_table_{id}"
9395

9496
async def list(self, endpoint):
97+
from mcp import Tool
98+
9599
tables = await sync_to_async(get_all_tables)(endpoint)
96100
tables = await sync_to_async(remove_table_no_permission)(
97101
endpoint, tables, CreateRowDatabaseTableOperationType
@@ -127,6 +131,8 @@ async def call(
127131
name_parameters,
128132
call_arguments,
129133
):
134+
from mcp.types import TextContent
135+
130136
table_id = name_parameters["id"]
131137
if not await sync_to_async(table_in_workspace_of_endpoint)(endpoint, table_id):
132138
return [TextContent(type="text", text="Table not in endpoint workspace.")]
@@ -148,6 +154,8 @@ class UpdateRowMcpTool(MCPTool):
148154
name = "update_row_table_{id}"
149155

150156
async def list(self, endpoint):
157+
from mcp import Tool
158+
151159
tables = await sync_to_async(get_all_tables)(endpoint)
152160
tables = await sync_to_async(remove_table_no_permission)(
153161
endpoint, tables, UpdateDatabaseRowOperationType
@@ -187,6 +195,8 @@ async def call(
187195
name_parameters,
188196
call_arguments,
189197
):
198+
from mcp.types import TextContent
199+
190200
table_id = name_parameters["id"]
191201
if not await sync_to_async(table_in_workspace_of_endpoint)(endpoint, table_id):
192202
return [TextContent(type="text", text="Table not in endpoint workspace.")]
@@ -211,6 +221,8 @@ class DeleteRowMcpTool(MCPTool):
211221
name = "delete_table_row"
212222

213223
async def list(self, endpoint):
224+
from mcp import Tool
225+
214226
return [
215227
Tool(
216228
name=self.name,
@@ -241,6 +253,8 @@ async def call(
241253
name_parameters,
242254
call_arguments,
243255
):
256+
from mcp.types import TextContent
257+
244258
table_id = call_arguments["table_id"]
245259
if not await sync_to_async(table_in_workspace_of_endpoint)(endpoint, table_id):
246260
return [TextContent(type="text", text="Table not in endpoint workspace.")]

backend/src/baserow/contrib/database/mcp/table/tools.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
import json
22

33
from asgiref.sync import sync_to_async
4-
from mcp import Tool
5-
from mcp.types import TextContent
64

75
from baserow.contrib.database.api.tables.serializers import (
86
TableWithoutDataSyncSerializer,
@@ -16,6 +14,8 @@ class ListTablesMcpTool(MCPTool):
1614
name = "list_tables"
1715

1816
async def list(self, endpoint):
17+
from mcp import Tool
18+
1919
return [
2020
Tool(
2121
name=self.name,
@@ -34,6 +34,8 @@ async def call(
3434
name_parameters,
3535
call_arguments,
3636
):
37+
from mcp.types import TextContent
38+
3739
tables = await sync_to_async(get_all_tables)(endpoint)
3840
serializer = TableWithoutDataSyncSerializer(tables, many=True)
3941
table_json = json.dumps(serializer.data)

backend/src/baserow/contrib/integrations/ai/service_types.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33

44
from django.contrib.auth.models import AbstractUser
55

6-
from langchain_core.exceptions import OutputParserException
7-
from langchain_core.prompts import PromptTemplate
86
from rest_framework import serializers
97
from rest_framework.exceptions import ValidationError as DRFValidationError
108

@@ -18,7 +16,7 @@
1816
)
1917
from baserow.core.generative_ai.registries import generative_ai_model_type_registry
2018
from baserow.core.integrations.handler import IntegrationHandler
21-
from baserow.core.output_parsers import StrictEnumOutputParser
19+
from baserow.core.output_parsers import get_strict_enum_output_parser
2220
from baserow.core.services.dispatch_context import DispatchContext
2321
from baserow.core.services.exceptions import (
2422
ServiceImproperlyConfiguredDispatchException,
@@ -170,6 +168,9 @@ def dispatch_data(
170168
resolved_values: Dict[str, Any],
171169
dispatch_context: DispatchContext,
172170
) -> Dict[str, Any]:
171+
from langchain_core.exceptions import OutputParserException
172+
from langchain_core.prompts import PromptTemplate
173+
173174
if not service.ai_generative_ai_type:
174175
raise ServiceImproperlyConfiguredDispatchException(
175176
"The AI provider type is missing."
@@ -228,7 +229,7 @@ def dispatch_data(
228229
choices_enum = enum.Enum(
229230
"Choices", {f"OPTION_{i}": choice for i, choice in enumerate(choices)}
230231
)
231-
output_parser = StrictEnumOutputParser(enum=choices_enum)
232+
output_parser = get_strict_enum_output_parser(enum=choices_enum)
232233
format_instructions = output_parser.get_format_instructions()
233234
prompt_template = PromptTemplate(
234235
template=prompt + "\n\nGiven this user query:\n\n{format_instructions}",

0 commit comments

Comments
 (0)