diff --git a/CHANGELOG.md b/CHANGELOG.md index 7c00715e79..d3d46182ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +- `opentelemetry-instrumentation-requests`, `opentelemetry-instrumentation-wsgi`, `opentelemetry-instrumentation-asgi` Detect synthetic sources on requests, ASGI, and WSGI. + ([#3674](https://github.com/open-telemetry/opentelemetry-python-contrib/pull/3674)) + ### Added - `opentelemetry-instrumentation-aiohttp-client`: add support for url exclusions via `OTEL_PYTHON_EXCLUDED_URLS` / `OTEL_PYTHON_AIOHTTP_CLIENT_EXCLUDED_URLS` diff --git a/instrumentation/opentelemetry-instrumentation-asgi/src/opentelemetry/instrumentation/asgi/__init__.py b/instrumentation/opentelemetry-instrumentation-asgi/src/opentelemetry/instrumentation/asgi/__init__.py index bb232b39d3..fb809e6836 100644 --- a/instrumentation/opentelemetry-instrumentation-asgi/src/opentelemetry/instrumentation/asgi/__init__.py +++ b/instrumentation/opentelemetry-instrumentation-asgi/src/opentelemetry/instrumentation/asgi/__init__.py @@ -258,6 +258,9 @@ def client_response_hook(span: Span, scope: Scope, message: dict[str, Any]): from opentelemetry.instrumentation.utils import _start_internal_or_server_span from opentelemetry.metrics import get_meter from opentelemetry.propagators.textmap import Getter, Setter +from opentelemetry.semconv._incubating.attributes.user_agent_attributes import ( + USER_AGENT_SYNTHETIC_TYPE, +) from opentelemetry.semconv._incubating.metrics.http_metrics import ( create_http_server_active_requests, create_http_server_request_body_size, @@ -276,6 +279,7 @@ def client_response_hook(span: Span, scope: Scope, message: dict[str, Any]): ExcludeList, SanitizeValue, _parse_url_query, + detect_synthetic_user_agent, get_custom_headers, normalise_request_header_name, normalise_response_header_name, @@ -397,7 +401,13 @@ def collect_request_attributes( ) http_user_agent = asgi_getter.get(scope, "user-agent") if http_user_agent: - _set_http_user_agent(result, http_user_agent[0], sem_conv_opt_in_mode) + user_agent_value = http_user_agent[0] + _set_http_user_agent(result, user_agent_value, sem_conv_opt_in_mode) + + # Check for synthetic user agent type + synthetic_type = detect_synthetic_user_agent(user_agent_value) + if synthetic_type: + result[USER_AGENT_SYNTHETIC_TYPE] = synthetic_type if "client" in scope and scope["client"] is not None: _set_http_peer_ip_server( diff --git a/instrumentation/opentelemetry-instrumentation-asgi/tests/test_asgi_middleware.py b/instrumentation/opentelemetry-instrumentation-asgi/tests/test_asgi_middleware.py index 0da3014f5f..fdf328498b 100644 --- a/instrumentation/opentelemetry-instrumentation-asgi/tests/test_asgi_middleware.py +++ b/instrumentation/opentelemetry-instrumentation-asgi/tests/test_asgi_middleware.py @@ -42,6 +42,9 @@ HistogramDataPoint, NumberDataPoint, ) +from opentelemetry.semconv._incubating.attributes.user_agent_attributes import ( + USER_AGENT_SYNTHETIC_TYPE, +) from opentelemetry.semconv.attributes.client_attributes import ( CLIENT_ADDRESS, CLIENT_PORT, @@ -883,6 +886,145 @@ def update_expected_user_agent(expected): new_sem_conv=True, ) + async def test_user_agent_synthetic_bot_detection(self): + """Test that bot user agents are detected as synthetic with type 'bot'""" + test_cases = [ + b"Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", + b"Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)", + b"googlebot/1.0", + b"bingbot/1.0", + ] + + # Test each user agent case separately to avoid span accumulation + for user_agent in test_cases: + with self.subTest(user_agent=user_agent): + # Clear headers first + self.scope["headers"] = [] + + def update_expected_synthetic_bot( + expected, ua: bytes = user_agent + ): + expected[3]["attributes"].update( + { + SpanAttributes.HTTP_USER_AGENT: ua.decode("utf8"), + USER_AGENT_SYNTHETIC_TYPE: "bot", + } + ) + return expected + + self.scope["headers"].append([b"user-agent", user_agent]) + app = otel_asgi.OpenTelemetryMiddleware(simple_asgi) + self.seed_app(app) + await self.send_default_request() + outputs = await self.get_all_output() + self.validate_outputs( + outputs, modifiers=[update_expected_synthetic_bot] + ) + + # Clear spans after each test case to prevent accumulation + self.memory_exporter.clear() + + async def test_user_agent_synthetic_test_detection(self): + """Test that test user agents are detected as synthetic with type 'test'""" + test_cases = [ + b"alwayson/1.0", + b"AlwaysOn/2.0", + b"test-alwayson-client", + ] + + # Test each user agent case separately to avoid span accumulation + for user_agent in test_cases: + with self.subTest(user_agent=user_agent): + # Clear headers first + self.scope["headers"] = [] + + def update_expected_synthetic_test( + expected, ua: bytes = user_agent + ): + expected[3]["attributes"].update( + { + SpanAttributes.HTTP_USER_AGENT: ua.decode("utf8"), + USER_AGENT_SYNTHETIC_TYPE: "test", + } + ) + return expected + + self.scope["headers"].append([b"user-agent", user_agent]) + app = otel_asgi.OpenTelemetryMiddleware(simple_asgi) + self.seed_app(app) + await self.send_default_request() + outputs = await self.get_all_output() + self.validate_outputs( + outputs, modifiers=[update_expected_synthetic_test] + ) + + # Clear spans after each test case to prevent accumulation + self.memory_exporter.clear() + + async def test_user_agent_non_synthetic(self): + """Test that normal user agents are not marked as synthetic""" + test_cases = [ + b"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", + b"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15", + b"PostmanRuntime/7.28.4", + b"curl/7.68.0", + ] + + # Test each user agent case separately to avoid span accumulation + for user_agent in test_cases: + with self.subTest(user_agent=user_agent): + # Clear headers first + self.scope["headers"] = [] + + def update_expected_non_synthetic( + expected, ua: bytes = user_agent + ): + # Should only have the user agent, not synthetic type + expected[3]["attributes"].update( + { + SpanAttributes.HTTP_USER_AGENT: ua.decode("utf8"), + } + ) + return expected + + self.scope["headers"].append([b"user-agent", user_agent]) + app = otel_asgi.OpenTelemetryMiddleware(simple_asgi) + self.seed_app(app) + await self.send_default_request() + outputs = await self.get_all_output() + self.validate_outputs( + outputs, modifiers=[update_expected_non_synthetic] + ) + + # Clear spans after each test case to prevent accumulation + self.memory_exporter.clear() + + async def test_user_agent_synthetic_new_semconv(self): + """Test synthetic user agent detection with new semantic conventions""" + user_agent = b"Mozilla/5.0 (compatible; Googlebot/2.1)" + + def update_expected_synthetic_new_semconv(expected): + expected[3]["attributes"].update( + { + USER_AGENT_ORIGINAL: user_agent.decode("utf8"), + USER_AGENT_SYNTHETIC_TYPE: "bot", + } + ) + return expected + + self.scope["headers"] = [] + self.scope["headers"].append([b"user-agent", user_agent]) + app = otel_asgi.OpenTelemetryMiddleware(simple_asgi) + self.seed_app(app) + await self.send_default_request() + outputs = await self.get_all_output() + self.validate_outputs( + outputs, + modifiers=[update_expected_synthetic_new_semconv], + old_sem_conv=False, + new_sem_conv=True, + ) + async def test_traceresponse_header(self): """Test a traceresponse header is sent when a global propagator is set.""" diff --git a/instrumentation/opentelemetry-instrumentation-requests/src/opentelemetry/instrumentation/requests/__init__.py b/instrumentation/opentelemetry-instrumentation-requests/src/opentelemetry/instrumentation/requests/__init__.py index 7cfc3a4fee..d834c1bb6c 100644 --- a/instrumentation/opentelemetry-instrumentation-requests/src/opentelemetry/instrumentation/requests/__init__.py +++ b/instrumentation/opentelemetry-instrumentation-requests/src/opentelemetry/instrumentation/requests/__init__.py @@ -132,11 +132,17 @@ def response_hook(span, request_obj, response): ) from opentelemetry.metrics import Histogram, get_meter from opentelemetry.propagate import inject +from opentelemetry.semconv._incubating.attributes.user_agent_attributes import ( + USER_AGENT_SYNTHETIC_TYPE, +) from opentelemetry.semconv.attributes.error_attributes import ERROR_TYPE from opentelemetry.semconv.attributes.network_attributes import ( NETWORK_PEER_ADDRESS, NETWORK_PEER_PORT, ) +from opentelemetry.semconv.attributes.user_agent_attributes import ( + USER_AGENT_ORIGINAL, +) from opentelemetry.semconv.metrics import MetricInstruments from opentelemetry.semconv.metrics.http_metrics import ( HTTP_CLIENT_REQUEST_DURATION, @@ -145,6 +151,7 @@ def response_hook(span, request_obj, response): from opentelemetry.trace.span import Span from opentelemetry.util.http import ( ExcludeList, + detect_synthetic_user_agent, get_excluded_urls, parse_excluded_urls, redact_url, @@ -243,6 +250,15 @@ def get_or_create_headers(): ) _set_http_url(span_attributes, url, sem_conv_opt_in_mode) + # Check for synthetic user agent type + headers = get_or_create_headers() + user_agent = headers.get("User-Agent") + synthetic_type = detect_synthetic_user_agent(user_agent) + if synthetic_type: + span_attributes[USER_AGENT_SYNTHETIC_TYPE] = synthetic_type + if user_agent: + span_attributes[USER_AGENT_ORIGINAL] = user_agent + metric_labels = {} _set_http_method( metric_labels, @@ -297,7 +313,6 @@ def get_or_create_headers(): if callable(request_hook): request_hook(span, request) - headers = get_or_create_headers() inject(headers) with suppress_http_instrumentation(): diff --git a/instrumentation/opentelemetry-instrumentation-requests/tests/test_requests_integration.py b/instrumentation/opentelemetry-instrumentation-requests/tests/test_requests_integration.py index ac3d41294b..ac9c0529f5 100644 --- a/instrumentation/opentelemetry-instrumentation-requests/tests/test_requests_integration.py +++ b/instrumentation/opentelemetry-instrumentation-requests/tests/test_requests_integration.py @@ -63,6 +63,9 @@ SERVER_PORT, ) from opentelemetry.semconv.attributes.url_attributes import URL_FULL +from opentelemetry.semconv.attributes.user_agent_attributes import ( + USER_AGENT_ORIGINAL, +) from opentelemetry.test.mock_textmap import MockTextMapPropagator from opentelemetry.test.test_base import TestBase from opentelemetry.trace import StatusCode @@ -175,6 +178,7 @@ def test_basic(self): HTTP_METHOD: "GET", HTTP_URL: self.URL, HTTP_STATUS_CODE: 200, + USER_AGENT_ORIGINAL: "python-requests/2.32.3", }, ) @@ -211,6 +215,7 @@ def test_basic_new_semconv(self): NETWORK_PROTOCOL_VERSION: "1.1", SERVER_PORT: 80, NETWORK_PEER_PORT: 80, + USER_AGENT_ORIGINAL: "python-requests/2.32.3", }, ) @@ -253,6 +258,7 @@ def test_basic_both_semconv(self): NETWORK_PROTOCOL_VERSION: "1.1", SERVER_PORT: 80, NETWORK_PEER_PORT: 80, + USER_AGENT_ORIGINAL: "python-requests/2.32.3", }, ) @@ -276,6 +282,7 @@ def test_nonstandard_http_method(self): HTTP_METHOD: "_OTHER", HTTP_URL: self.URL, HTTP_STATUS_CODE: 405, + USER_AGENT_ORIGINAL: "python-requests/2.32.3", }, ) @@ -300,6 +307,7 @@ def test_nonstandard_http_method_new_semconv(self): NETWORK_PROTOCOL_VERSION: "1.1", ERROR_TYPE: "405", HTTP_REQUEST_METHOD_ORIGINAL: "NONSTANDARD", + USER_AGENT_ORIGINAL: "python-requests/2.32.3", }, ) self.assertIs(span.status.status_code, trace.StatusCode.ERROR) @@ -534,6 +542,7 @@ def response_hook( HTTP_URL: self.URL, HTTP_STATUS_CODE: 200, "http.response.body": "Hello!", + USER_AGENT_ORIGINAL: "python-requests/2.32.3", }, ) @@ -564,6 +573,7 @@ def test_requests_exception_without_response(self, *_, **__): { HTTP_METHOD: "GET", HTTP_URL: self.URL, + USER_AGENT_ORIGINAL: "python-requests/2.32.3", }, ) self.assertEqual(span.status.status_code, StatusCode.ERROR) @@ -591,6 +601,7 @@ def test_requests_exception_new_semconv(self, *_, **__): NETWORK_PEER_PORT: 80, NETWORK_PEER_ADDRESS: "mock", ERROR_TYPE: "RequestException", + USER_AGENT_ORIGINAL: "python-requests/2.32.3", }, ) self.assertEqual(span.status.status_code, StatusCode.ERROR) @@ -613,6 +624,7 @@ def test_requests_exception_without_proper_response_type(self, *_, **__): { HTTP_METHOD: "GET", HTTP_URL: self.URL, + USER_AGENT_ORIGINAL: "python-requests/2.32.3", }, ) self.assertEqual(span.status.status_code, StatusCode.ERROR) @@ -636,6 +648,7 @@ def test_requests_exception_with_response(self, *_, **__): HTTP_METHOD: "GET", HTTP_URL: self.URL, HTTP_STATUS_CODE: 500, + USER_AGENT_ORIGINAL: "python-requests/2.32.3", }, ) self.assertEqual(span.status.status_code, StatusCode.ERROR) @@ -675,6 +688,7 @@ def test_adapter_with_custom_response(self): "http.method": "GET", "http.url": self.URL, "http.status_code": 210, + USER_AGENT_ORIGINAL: "python-requests/2.32.3", }, ) diff --git a/instrumentation/opentelemetry-instrumentation-requests/tests/test_user_agent_synthetic.py b/instrumentation/opentelemetry-instrumentation-requests/tests/test_user_agent_synthetic.py new file mode 100644 index 0000000000..4adcc2146b --- /dev/null +++ b/instrumentation/opentelemetry-instrumentation-requests/tests/test_user_agent_synthetic.py @@ -0,0 +1,167 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import httpretty +import requests + +from opentelemetry.instrumentation.requests import RequestsInstrumentor +from opentelemetry.semconv._incubating.attributes.user_agent_attributes import ( + USER_AGENT_SYNTHETIC_TYPE, + UserAgentSyntheticTypeValues, +) +from opentelemetry.test.test_base import TestBase + + +class TestUserAgentSynthetic(TestBase): + URL = "http://mock/status/200" + + def setUp(self): + super().setUp() + RequestsInstrumentor().instrument() + httpretty.enable() + httpretty.register_uri(httpretty.GET, self.URL, body="Hello!") + + def tearDown(self): + super().tearDown() + RequestsInstrumentor().uninstrument() + httpretty.disable() + + def assert_span(self, num_spans=1): + span_list = self.memory_exporter.get_finished_spans() + self.assertEqual(num_spans, len(span_list)) + if num_spans == 0: + return None + if num_spans == 1: + return span_list[0] + return span_list + + def test_user_agent_bot_googlebot(self): + """Test that googlebot user agent is marked as 'bot'""" + headers = { + "User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" + } + requests.get(self.URL, headers=headers, timeout=5) + + span = self.assert_span() + self.assertEqual( + span.attributes.get(USER_AGENT_SYNTHETIC_TYPE), + UserAgentSyntheticTypeValues.BOT.value, + ) + + def test_user_agent_bot_bingbot(self): + """Test that bingbot user agent is marked as 'bot'""" + headers = { + "User-Agent": "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)" + } + requests.get(self.URL, headers=headers, timeout=5) + + span = self.assert_span() + self.assertEqual( + span.attributes.get(USER_AGENT_SYNTHETIC_TYPE), + UserAgentSyntheticTypeValues.BOT.value, + ) + + def test_user_agent_test_alwayson(self): + """Test that alwayson user agent is marked as 'test'""" + headers = {"User-Agent": "AlwaysOn-Monitor/1.0"} + requests.get(self.URL, headers=headers, timeout=5) + + span = self.assert_span() + self.assertEqual( + span.attributes.get(USER_AGENT_SYNTHETIC_TYPE), + UserAgentSyntheticTypeValues.TEST.value, + ) + + def test_user_agent_case_insensitive(self): + """Test that detection is case insensitive""" + headers = {"User-Agent": "GOOGLEBOT/2.1"} + requests.get(self.URL, headers=headers, timeout=5) + + span = self.assert_span() + self.assertEqual( + span.attributes.get(USER_AGENT_SYNTHETIC_TYPE), + UserAgentSyntheticTypeValues.BOT.value, + ) + + self.memory_exporter.clear() + + headers = {"User-Agent": "ALWAYSON-Monitor/1.0"} + requests.get(self.URL, headers=headers, timeout=5) + + span = self.assert_span() + self.assertEqual( + span.attributes.get(USER_AGENT_SYNTHETIC_TYPE), + UserAgentSyntheticTypeValues.TEST.value, + ) + + def test_user_agent_normal_browser(self): + """Test that normal browser user agents don't get synthetic type""" + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" + } + requests.get(self.URL, headers=headers, timeout=5) + + span = self.assert_span() + self.assertNotIn(USER_AGENT_SYNTHETIC_TYPE, span.attributes) + + def test_no_user_agent_header(self): + """Test that requests without user agent don't get synthetic type""" + requests.get(self.URL, timeout=5) + + span = self.assert_span() + self.assertNotIn(USER_AGENT_SYNTHETIC_TYPE, span.attributes) + + def test_empty_user_agent_header(self): + """Test that empty user agent doesn't get synthetic type""" + headers = {"User-Agent": ""} + requests.get(self.URL, headers=headers, timeout=5) + + span = self.assert_span() + self.assertNotIn(USER_AGENT_SYNTHETIC_TYPE, span.attributes) + + def test_user_agent_substring_match(self): + """Test that substrings are detected correctly""" + # Test googlebot in middle of string + headers = {"User-Agent": "MyApp/1.0 googlebot crawler"} + requests.get(self.URL, headers=headers, timeout=5) + + span = self.assert_span() + self.assertEqual( + span.attributes.get(USER_AGENT_SYNTHETIC_TYPE), + UserAgentSyntheticTypeValues.BOT.value, + ) + + self.memory_exporter.clear() + + # Test alwayson in middle of string + headers = {"User-Agent": "TestFramework/1.0 alwayson monitoring"} + requests.get(self.URL, headers=headers, timeout=5) + + span = self.assert_span() + self.assertEqual( + span.attributes.get(USER_AGENT_SYNTHETIC_TYPE), + UserAgentSyntheticTypeValues.TEST.value, + ) + + def test_user_agent_priority_alwayson_over_bot(self): + """Test that alwayson takes priority if both patterns match""" + headers = {"User-Agent": "alwayson-googlebot/1.0"} + requests.get(self.URL, headers=headers, timeout=5) + + span = self.assert_span() + # alwayson should be checked first and return 'test' + self.assertEqual( + span.attributes.get(USER_AGENT_SYNTHETIC_TYPE), + UserAgentSyntheticTypeValues.TEST.value, + ) diff --git a/instrumentation/opentelemetry-instrumentation-wsgi/src/opentelemetry/instrumentation/wsgi/__init__.py b/instrumentation/opentelemetry-instrumentation-wsgi/src/opentelemetry/instrumentation/wsgi/__init__.py index bfbf5b6dba..1107287b68 100644 --- a/instrumentation/opentelemetry-instrumentation-wsgi/src/opentelemetry/instrumentation/wsgi/__init__.py +++ b/instrumentation/opentelemetry-instrumentation-wsgi/src/opentelemetry/instrumentation/wsgi/__init__.py @@ -258,6 +258,9 @@ def response_hook(span: Span, environ: WSGIEnvironment, status: str, response_he HTTP_SERVER_NAME, HTTP_URL, ) +from opentelemetry.semconv._incubating.attributes.user_agent_attributes import ( + USER_AGENT_SYNTHETIC_TYPE, +) from opentelemetry.semconv.attributes.error_attributes import ERROR_TYPE from opentelemetry.semconv.metrics import MetricInstruments from opentelemetry.semconv.metrics.http_metrics import ( @@ -271,6 +274,7 @@ def response_hook(span: Span, environ: WSGIEnvironment, status: str, response_he OTEL_INSTRUMENTATION_HTTP_CAPTURE_HEADERS_SERVER_RESPONSE, SanitizeValue, _parse_url_query, + detect_synthetic_user_agent, get_custom_headers, normalise_request_header_name, normalise_response_header_name, @@ -391,6 +395,11 @@ def collect_request_attributes( if user_agent is not None and len(user_agent) > 0: _set_http_user_agent(result, user_agent, sem_conv_opt_in_mode) + # Check for synthetic user agent type + synthetic_type = detect_synthetic_user_agent(user_agent) + if synthetic_type: + result[USER_AGENT_SYNTHETIC_TYPE] = synthetic_type + flavor = environ.get("SERVER_PROTOCOL", "") if flavor.upper().startswith(_HTTP_VERSION_PREFIX): flavor = flavor[len(_HTTP_VERSION_PREFIX) :] diff --git a/instrumentation/opentelemetry-instrumentation-wsgi/tests/test_wsgi_middleware.py b/instrumentation/opentelemetry-instrumentation-wsgi/tests/test_wsgi_middleware.py index 5a6e2d21f7..bb6c3aca2f 100644 --- a/instrumentation/opentelemetry-instrumentation-wsgi/tests/test_wsgi_middleware.py +++ b/instrumentation/opentelemetry-instrumentation-wsgi/tests/test_wsgi_middleware.py @@ -52,6 +52,9 @@ NET_HOST_NAME, NET_HOST_PORT, ) +from opentelemetry.semconv._incubating.attributes.user_agent_attributes import ( + USER_AGENT_SYNTHETIC_TYPE, +) from opentelemetry.semconv.attributes.http_attributes import ( HTTP_REQUEST_METHOD, HTTP_RESPONSE_STATUS_CODE, @@ -527,6 +530,7 @@ def test_default_span_name_missing_path_info(self): self.validate_response(response, span_name=method) +# pylint: disable=too-many-public-methods class TestWsgiAttributes(unittest.TestCase): def setUp(self): self.environ = {} @@ -791,6 +795,83 @@ def test_http_user_agent_attribute(self): expected_new.items(), ) + def test_http_user_agent_synthetic_bot_detection(self): + """Test that bot user agents are detected as synthetic with type 'bot'""" + test_cases = [ + "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", + "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)", + "googlebot/1.0", + "bingbot/1.0", + ] + + for user_agent in test_cases: + with self.subTest(user_agent=user_agent): + self.environ["HTTP_USER_AGENT"] = user_agent + attributes = otel_wsgi.collect_request_attributes(self.environ) + + # Should have both the original user agent and synthetic type + self.assertIn(HTTP_USER_AGENT, attributes) + self.assertEqual(attributes[HTTP_USER_AGENT], user_agent) + self.assertIn(USER_AGENT_SYNTHETIC_TYPE, attributes) + self.assertEqual(attributes[USER_AGENT_SYNTHETIC_TYPE], "bot") + + def test_http_user_agent_synthetic_test_detection(self): + """Test that test user agents are detected as synthetic with type 'test'""" + test_cases = [ + "alwayson/1.0", + "AlwaysOn/2.0", + "test-alwayson-client", + ] + + for user_agent in test_cases: + with self.subTest(user_agent=user_agent): + self.environ["HTTP_USER_AGENT"] = user_agent + attributes = otel_wsgi.collect_request_attributes(self.environ) + + # Should have both the original user agent and synthetic type + self.assertIn(HTTP_USER_AGENT, attributes) + self.assertEqual(attributes[HTTP_USER_AGENT], user_agent) + self.assertIn(USER_AGENT_SYNTHETIC_TYPE, attributes) + self.assertEqual(attributes[USER_AGENT_SYNTHETIC_TYPE], "test") + + def test_http_user_agent_non_synthetic(self): + """Test that normal user agents are not marked as synthetic""" + test_cases = [ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15", + "PostmanRuntime/7.28.4", + "curl/7.68.0", + ] + + for user_agent in test_cases: + with self.subTest(user_agent=user_agent): + self.environ["HTTP_USER_AGENT"] = user_agent + attributes = otel_wsgi.collect_request_attributes(self.environ) + + # Should have the original user agent but no synthetic type + self.assertIn(HTTP_USER_AGENT, attributes) + self.assertEqual(attributes[HTTP_USER_AGENT], user_agent) + self.assertNotIn(USER_AGENT_SYNTHETIC_TYPE, attributes) + + def test_http_user_agent_synthetic_new_semconv(self): + """Test synthetic user agent detection with new semantic conventions""" + self.environ["HTTP_USER_AGENT"] = ( + "Mozilla/5.0 (compatible; Googlebot/2.1)" + ) + attributes = otel_wsgi.collect_request_attributes( + self.environ, + _StabilityMode.HTTP, + ) + + # Should have both the new semconv user agent and synthetic type + self.assertIn(USER_AGENT_ORIGINAL, attributes) + self.assertEqual( + attributes[USER_AGENT_ORIGINAL], + "Mozilla/5.0 (compatible; Googlebot/2.1)", + ) + self.assertIn(USER_AGENT_SYNTHETIC_TYPE, attributes) + self.assertEqual(attributes[USER_AGENT_SYNTHETIC_TYPE], "bot") + def test_response_attributes(self): otel_wsgi.add_response_attributes(self.span, "404 Not Found", {}) otel_wsgi.add_response_attributes( diff --git a/util/opentelemetry-util-http/src/opentelemetry/util/http/__init__.py b/util/opentelemetry-util-http/src/opentelemetry/util/http/__init__.py index 6c1403fc4f..e23e03dede 100644 --- a/util/opentelemetry-util-http/src/opentelemetry/util/http/__init__.py +++ b/util/opentelemetry-util-http/src/opentelemetry/util/http/__init__.py @@ -19,7 +19,7 @@ from re import IGNORECASE as RE_IGNORECASE from re import compile as re_compile from re import search -from typing import Callable, Iterable, overload +from typing import Callable, Iterable, Optional, overload from urllib.parse import parse_qs, urlencode, urlparse, urlunparse from opentelemetry.semconv._incubating.attributes.http_attributes import ( @@ -34,6 +34,10 @@ NET_HOST_NAME, NET_HOST_PORT, ) +from opentelemetry.semconv._incubating.attributes.user_agent_attributes import ( + UserAgentSyntheticTypeValues, +) +from opentelemetry.util.http.constants import BOT_PATTERNS, TEST_PATTERNS OTEL_INSTRUMENTATION_HTTP_CAPTURE_HEADERS_SANITIZE_FIELDS = ( "OTEL_INSTRUMENTATION_HTTP_CAPTURE_HEADERS_SANITIZE_FIELDS" @@ -301,3 +305,30 @@ def redact_url(url: str) -> str: url = remove_url_credentials(url) url = redact_query_parameters(url) return url + + +def detect_synthetic_user_agent(user_agent: Optional[str]) -> Optional[str]: + """ + Detect synthetic user agent type based on user agent string contents. + + Args: + user_agent: The user agent string to analyze + + Returns: + UserAgentSyntheticTypeValues.TEST if user agent contains any pattern from TEST_PATTERNS + UserAgentSyntheticTypeValues.BOT if user agent contains any pattern from BOT_PATTERNS + None otherwise + + Note: Test patterns take priority over bot patterns. + """ + if not user_agent: + return None + + user_agent_lower = user_agent.lower() + + if any(test_pattern in user_agent_lower for test_pattern in TEST_PATTERNS): + return UserAgentSyntheticTypeValues.TEST.value + if any(bot_pattern in user_agent_lower for bot_pattern in BOT_PATTERNS): + return UserAgentSyntheticTypeValues.BOT.value + + return None diff --git a/util/opentelemetry-util-http/src/opentelemetry/util/http/constants.py b/util/opentelemetry-util-http/src/opentelemetry/util/http/constants.py new file mode 100644 index 0000000000..e49799daca --- /dev/null +++ b/util/opentelemetry-util-http/src/opentelemetry/util/http/constants.py @@ -0,0 +1,34 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Constants for OpenTelemetry HTTP utilities. + +This module contains configuration constants and pattern definitions used +by HTTP instrumentation utilities for various features like synthetic user +agent detection. +""" + +# Test patterns to detect in user agent strings (case-insensitive) +# These patterns indicate synthetic test traffic +TEST_PATTERNS = [ + "alwayson", +] + +# Bot patterns to detect in user agent strings (case-insensitive) +# These patterns indicate automated bot traffic +BOT_PATTERNS = [ + "googlebot", + "bingbot", +] diff --git a/util/opentelemetry-util-http/tests/test_detect_synthetic_user_agent.py b/util/opentelemetry-util-http/tests/test_detect_synthetic_user_agent.py new file mode 100644 index 0000000000..2d9d3e9913 --- /dev/null +++ b/util/opentelemetry-util-http/tests/test_detect_synthetic_user_agent.py @@ -0,0 +1,88 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from opentelemetry.semconv._incubating.attributes.user_agent_attributes import ( + UserAgentSyntheticTypeValues, +) +from opentelemetry.util.http import detect_synthetic_user_agent + + +class TestDetectSyntheticUserAgent(unittest.TestCase): + def test_detect_bot_googlebot(self): + """Test detection of googlebot user agent.""" + user_agent = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" + result = detect_synthetic_user_agent(user_agent) + self.assertEqual(result, UserAgentSyntheticTypeValues.BOT.value) + + def test_detect_bot_bingbot(self): + """Test detection of bingbot user agent.""" + user_agent = "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)" + result = detect_synthetic_user_agent(user_agent) + self.assertEqual(result, UserAgentSyntheticTypeValues.BOT.value) + + def test_detect_test_alwayson(self): + """Test detection of alwayson test user agent.""" + user_agent = "AlwaysOn-Monitor/1.0" + result = detect_synthetic_user_agent(user_agent) + self.assertEqual(result, UserAgentSyntheticTypeValues.TEST.value) + + def test_case_insensitive_detection(self): + """Test that detection is case insensitive.""" + # Test uppercase patterns + user_agent_bot = "GOOGLEBOT/2.1" + result = detect_synthetic_user_agent(user_agent_bot) + self.assertEqual(result, UserAgentSyntheticTypeValues.BOT.value) + + user_agent_test = "ALWAYSON-Monitor/1.0" + result = detect_synthetic_user_agent(user_agent_test) + self.assertEqual(result, UserAgentSyntheticTypeValues.TEST.value) + + def test_normal_user_agent_not_detected(self): + """Test that normal browser user agents are not detected as synthetic.""" + user_agent = ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" + ) + result = detect_synthetic_user_agent(user_agent) + self.assertIsNone(result) + + def test_none_user_agent(self): + """Test that None user agent returns None.""" + result = detect_synthetic_user_agent(None) + self.assertIsNone(result) + + def test_empty_user_agent(self): + """Test that empty user agent returns None.""" + result = detect_synthetic_user_agent("") + self.assertIsNone(result) + + def test_substring_match(self): + """Test that substrings are detected correctly.""" + # Test googlebot in middle of string + user_agent = "MyApp/1.0 googlebot crawler" + result = detect_synthetic_user_agent(user_agent) + self.assertEqual(result, UserAgentSyntheticTypeValues.BOT.value) + + # Test alwayson in middle of string + user_agent = "TestFramework/1.0 alwayson monitoring" + result = detect_synthetic_user_agent(user_agent) + self.assertEqual(result, UserAgentSyntheticTypeValues.TEST.value) + + def test_priority_test_over_bot(self): + """Test that test patterns take priority over bot patterns.""" + user_agent = "alwayson-googlebot/1.0" + result = detect_synthetic_user_agent(user_agent) + # alwayson should be checked first and return 'test' + self.assertEqual(result, UserAgentSyntheticTypeValues.TEST.value)