Skip to content

Commit b3193f2

Browse files
pafi-codexrmx and Riccardo Magliocchetti authored
feat(http): add error handling for exporting (open-telemetry#4709)
* feat(http): add error handling for exporting * feat(http_exporter): allow to run retry loop on connection errors * feat(http): change error types that are caught * refactor(http): introduce variables to unify logging * feat(http_exporter): only retry on connection error * test(http_exporter): add test case for connection errors while exporting * refactor(http): simplify if statements * docs(changelog): add changes * fix(http_exporter): use correct class after rebase * docs(changelog): update changelog * refactor(http_exporter): add empty space in logs * docs(tests): remove comments * refactor(tests): simplify tests * Update CHANGELOG.md * Update exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py * Apply suggestions from code review --------- Co-authored-by: Riccardo Magliocchetti <riccardo.magliocchetti@gmail.com>
1 parent 62e9ad3 commit b3193f2

File tree

7 files changed

+183
-28
lines changed

7 files changed

+183
-28
lines changed

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
2626
([#4847](https://github.com/open-telemetry/opentelemetry-python/pull/4847))
2727
- Prevent possible endless recursion from happening in `SimpleLogRecordProcessor.on_emit`,
2828
([#4799](https://github.com/open-telemetry/opentelemetry-python/pull/4799)) and ([#4867](https://github.com/open-telemetry/opentelemetry-python/pull/4867)).
29+
- `opentelemetry-exporter-otlp-proto-http`: fix retry logic and error handling for connection failures in trace, metric, and log exporters
30+
([#4709](https://github.com/open-telemetry/opentelemetry-python/pull/4709))
2931

3032
## Version 1.39.0/0.60b0 (2025-12-03)
3133

@@ -93,7 +95,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
9395
([#4654](https://github.com/open-telemetry/opentelemetry-python/pull/4654)).
9496
- Fix type checking for built-in metric exporters
9597
([#4820](https://github.com/open-telemetry/opentelemetry-python/pull/4820))
96-
98+
9799
## Version 1.38.0/0.59b0 (2025-10-16)
98100

99101
- Add `rstcheck` to pre-commit to stop introducing invalid RST

exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -186,26 +186,42 @@ def export(
186186
serialized_data = encode_logs(batch).SerializeToString()
187187
deadline_sec = time() + self._timeout
188188
for retry_num in range(_MAX_RETRYS):
189-
resp = self._export(serialized_data, deadline_sec - time())
190-
if resp.ok:
191-
return LogRecordExportResult.SUCCESS
192189
# multiplying by a random number between .8 and 1.2 introduces a +/20% jitter to each backoff.
193190
backoff_seconds = 2**retry_num * random.uniform(0.8, 1.2)
191+
try:
192+
resp = self._export(serialized_data, deadline_sec - time())
193+
if resp.ok:
194+
return LogRecordExportResult.SUCCESS
195+
except requests.exceptions.RequestException as error:
196+
reason = error
197+
retryable = isinstance(error, ConnectionError)
198+
status_code = None
199+
else:
200+
reason = resp.reason
201+
retryable = _is_retryable(resp)
202+
status_code = resp.status_code
203+
204+
if not retryable:
205+
_logger.error(
206+
"Failed to export logs batch code: %s, reason: %s",
207+
status_code,
208+
reason,
209+
)
210+
return LogRecordExportResult.FAILURE
211+
194212
if (
195-
not _is_retryable(resp)
196-
or retry_num + 1 == _MAX_RETRYS
213+
retry_num + 1 == _MAX_RETRYS
197214
or backoff_seconds > (deadline_sec - time())
198215
or self._shutdown
199216
):
200217
_logger.error(
201-
"Failed to export logs batch code: %s, reason: %s",
202-
resp.status_code,
203-
resp.text,
218+
"Failed to export logs batch due to timeout, "
219+
"max retries or shutdown."
204220
)
205221
return LogRecordExportResult.FAILURE
206222
_logger.warning(
207223
"Transient error %s encountered while exporting logs batch, retrying in %.2fs.",
208-
resp.reason,
224+
reason,
209225
backoff_seconds,
210226
)
211227
shutdown = self._shutdown_is_occuring.wait(backoff_seconds)

exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -231,26 +231,41 @@ def export(
231231
serialized_data = encode_metrics(metrics_data).SerializeToString()
232232
deadline_sec = time() + self._timeout
233233
for retry_num in range(_MAX_RETRYS):
234-
resp = self._export(serialized_data, deadline_sec - time())
235-
if resp.ok:
236-
return MetricExportResult.SUCCESS
237234
# multiplying by a random number between .8 and 1.2 introduces a +/20% jitter to each backoff.
238235
backoff_seconds = 2**retry_num * random.uniform(0.8, 1.2)
236+
try:
237+
resp = self._export(serialized_data, deadline_sec - time())
238+
if resp.ok:
239+
return MetricExportResult.SUCCESS
240+
except requests.exceptions.RequestException as error:
241+
reason = error
242+
retryable = isinstance(error, ConnectionError)
243+
status_code = None
244+
else:
245+
reason = resp.reason
246+
retryable = _is_retryable(resp)
247+
status_code = resp.status_code
248+
249+
if not retryable:
250+
_logger.error(
251+
"Failed to export metrics batch code: %s, reason: %s",
252+
status_code,
253+
reason,
254+
)
255+
return MetricExportResult.FAILURE
239256
if (
240-
not _is_retryable(resp)
241-
or retry_num + 1 == _MAX_RETRYS
257+
retry_num + 1 == _MAX_RETRYS
242258
or backoff_seconds > (deadline_sec - time())
243259
or self._shutdown
244260
):
245261
_logger.error(
246-
"Failed to export metrics batch code: %s, reason: %s",
247-
resp.status_code,
248-
resp.text,
262+
"Failed to export metrics batch due to timeout, "
263+
"max retries or shutdown."
249264
)
250265
return MetricExportResult.FAILURE
251266
_logger.warning(
252267
"Transient error %s encountered while exporting metrics batch, retrying in %.2fs.",
253-
resp.reason,
268+
reason,
254269
backoff_seconds,
255270
)
256271
shutdown = self._shutdown_in_progress.wait(backoff_seconds)

exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -179,26 +179,42 @@ def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
179179
serialized_data = encode_spans(spans).SerializePartialToString()
180180
deadline_sec = time() + self._timeout
181181
for retry_num in range(_MAX_RETRYS):
182-
resp = self._export(serialized_data, deadline_sec - time())
183-
if resp.ok:
184-
return SpanExportResult.SUCCESS
185182
# multiplying by a random number between .8 and 1.2 introduces a +/20% jitter to each backoff.
186183
backoff_seconds = 2**retry_num * random.uniform(0.8, 1.2)
184+
try:
185+
resp = self._export(serialized_data, deadline_sec - time())
186+
if resp.ok:
187+
return SpanExportResult.SUCCESS
188+
except requests.exceptions.RequestException as error:
189+
reason = error
190+
retryable = isinstance(error, ConnectionError)
191+
status_code = None
192+
else:
193+
reason = resp.reason
194+
retryable = _is_retryable(resp)
195+
status_code = resp.status_code
196+
197+
if not retryable:
198+
_logger.error(
199+
"Failed to export span batch code: %s, reason: %s",
200+
status_code,
201+
reason,
202+
)
203+
return SpanExportResult.FAILURE
204+
187205
if (
188-
not _is_retryable(resp)
189-
or retry_num + 1 == _MAX_RETRYS
206+
retry_num + 1 == _MAX_RETRYS
190207
or backoff_seconds > (deadline_sec - time())
191208
or self._shutdown
192209
):
193210
_logger.error(
194-
"Failed to export span batch code: %s, reason: %s",
195-
resp.status_code,
196-
resp.text,
211+
"Failed to export span batch due to timeout, "
212+
"max retries or shutdown."
197213
)
198214
return SpanExportResult.FAILURE
199215
_logger.warning(
200216
"Transient error %s encountered while exporting span batch, retrying in %.2fs.",
201-
resp.reason,
217+
reason,
202218
backoff_seconds,
203219
)
204220
shutdown = self._shutdown_in_progress.wait(backoff_seconds)

exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@
1919
from unittest import TestCase
2020
from unittest.mock import ANY, MagicMock, Mock, patch
2121

22+
import requests
2223
from requests import Session
24+
from requests.exceptions import ConnectionError
2325
from requests.models import Response
2426

2527
from opentelemetry.exporter.otlp.proto.common.metrics_encoder import (
@@ -555,6 +557,40 @@ def test_retry_timeout(self, mock_post):
555557
warning.records[0].message,
556558
)
557559

560+
@patch.object(Session, "post")
561+
def test_export_no_collector_available_retryable(self, mock_post):
562+
exporter = OTLPMetricExporter(timeout=1.5)
563+
msg = "Server not available."
564+
mock_post.side_effect = ConnectionError(msg)
565+
with self.assertLogs(level=WARNING) as warning:
566+
self.assertEqual(
567+
exporter.export(self.metrics["sum_int"]),
568+
MetricExportResult.FAILURE,
569+
)
570+
# Check for greater 2 because the request is on each retry
571+
# done twice at the moment.
572+
self.assertGreater(mock_post.call_count, 2)
573+
self.assertIn(
574+
f"Transient error {msg} encountered while exporting metrics batch, retrying in",
575+
warning.records[0].message,
576+
)
577+
578+
@patch.object(Session, "post")
579+
def test_export_no_collector_available(self, mock_post):
580+
exporter = OTLPMetricExporter(timeout=1.5)
581+
582+
mock_post.side_effect = requests.exceptions.RequestException()
583+
with self.assertLogs(level=WARNING) as warning:
584+
self.assertEqual(
585+
exporter.export(self.metrics["sum_int"]),
586+
MetricExportResult.FAILURE,
587+
)
588+
self.assertEqual(mock_post.call_count, 1)
589+
self.assertIn(
590+
"Failed to export metrics batch code",
591+
warning.records[0].message,
592+
)
593+
558594
@patch.object(Session, "post")
559595
def test_timeout_set_correctly(self, mock_post):
560596
resp = Response()

exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import requests
2525
from google.protobuf.json_format import MessageToDict
2626
from requests import Session
27+
from requests.exceptions import ConnectionError
2728
from requests.models import Response
2829

2930
from opentelemetry._logs import LogRecord, SeverityNumber
@@ -483,6 +484,40 @@ def test_retry_timeout(self, mock_post):
483484
warning.records[0].message,
484485
)
485486

487+
@patch.object(Session, "post")
488+
def test_export_no_collector_available_retryable(self, mock_post):
489+
exporter = OTLPLogExporter(timeout=1.5)
490+
msg = "Server not available."
491+
mock_post.side_effect = ConnectionError(msg)
492+
with self.assertLogs(level=WARNING) as warning:
493+
self.assertEqual(
494+
exporter.export(self._get_sdk_log_data()),
495+
LogRecordExportResult.FAILURE,
496+
)
497+
# Check for greater 2 because the request is on each retry
498+
# done twice at the moment.
499+
self.assertGreater(mock_post.call_count, 2)
500+
self.assertIn(
501+
f"Transient error {msg} encountered while exporting logs batch, retrying in",
502+
warning.records[0].message,
503+
)
504+
505+
@patch.object(Session, "post")
506+
def test_export_no_collector_available(self, mock_post):
507+
exporter = OTLPLogExporter(timeout=1.5)
508+
509+
mock_post.side_effect = requests.exceptions.RequestException()
510+
with self.assertLogs(level=WARNING) as warning:
511+
self.assertEqual(
512+
exporter.export(self._get_sdk_log_data()),
513+
LogRecordExportResult.FAILURE,
514+
)
515+
self.assertEqual(mock_post.call_count, 1)
516+
self.assertIn(
517+
"Failed to export logs batch code",
518+
warning.records[0].message,
519+
)
520+
486521
@patch.object(Session, "post")
487522
def test_timeout_set_correctly(self, mock_post):
488523
resp = Response()

exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
import requests
2222
from requests import Session
23+
from requests.exceptions import ConnectionError
2324
from requests.models import Response
2425

2526
from opentelemetry.exporter.otlp.proto.http import Compression
@@ -303,6 +304,40 @@ def test_retry_timeout(self, mock_post):
303304
warning.records[0].message,
304305
)
305306

307+
@patch.object(Session, "post")
308+
def test_export_no_collector_available_retryable(self, mock_post):
309+
exporter = OTLPSpanExporter(timeout=1.5)
310+
msg = "Server not available."
311+
mock_post.side_effect = ConnectionError(msg)
312+
with self.assertLogs(level=WARNING) as warning:
313+
self.assertEqual(
314+
exporter.export([BASIC_SPAN]),
315+
SpanExportResult.FAILURE,
316+
)
317+
# Check for greater 2 because the request is on each retry
318+
# done twice at the moment.
319+
self.assertGreater(mock_post.call_count, 2)
320+
self.assertIn(
321+
f"Transient error {msg} encountered while exporting span batch, retrying in",
322+
warning.records[0].message,
323+
)
324+
325+
@patch.object(Session, "post")
326+
def test_export_no_collector_available(self, mock_post):
327+
exporter = OTLPSpanExporter(timeout=1.5)
328+
329+
mock_post.side_effect = requests.exceptions.RequestException()
330+
with self.assertLogs(level=WARNING) as warning:
331+
self.assertEqual(
332+
exporter.export([BASIC_SPAN]),
333+
SpanExportResult.FAILURE,
334+
)
335+
self.assertEqual(mock_post.call_count, 1)
336+
self.assertIn(
337+
"Failed to export span batch code",
338+
warning.records[0].message,
339+
)
340+
306341
@patch.object(Session, "post")
307342
def test_timeout_set_correctly(self, mock_post):
308343
resp = Response()

0 commit comments

Comments (0)