Skip to content

Commit 93a55dd

Browse files
authored
feat: Normalize git repository URLs for API queries on the new API (#3986)
For #3830 Change the new `AffectedVersions` entities to store a normalized Git repository URL so that queries for tags with the new API logic aren't dependent on the exact repository URL: - remove the protocol/scheme - There are currently only 16 unique repositories in OSV (test instance) that don't use the `https://` scheme, and only 3[^1] of these repos have vulns with both `http://` & `https://` - remove the `.git` extension - Mostly, OSS-Fuzz and CURL use GitHub repos with the `.git` extension, while our CVEs do not. I will need to do a re-put of all the GIT records in the test instance to repopulate the names in the `AffectedVersions` entities. This doesn't yet fix the issue on production - it's too complicated to try to fix with the current/old querying logic; it'll just be fixed when the migration is complete. [^1]: git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git ([http](https://osv.dev/vulnerability/CVE-2019-19352), [https](https://osv.dev/vulnerability/CVE-2023-32255)) git.musl-libc.org/git/musl ([http](https://osv.dev/vulnerability/CVE-2017-15650), [https](https://osv.dev/vulnerability/CVE-2025-26519)) git.savannah.gnu.org/git/wget.git ([http](https://osv.dev/vulnerability/CVE-2016-7098), [https](https://osv.dev/vulnerability/CVE-2018-20483))
1 parent d124dea commit 93a55dd

File tree

5 files changed

+83
-73
lines changed

5 files changed

+83
-73
lines changed

gcp/api/server.py

Lines changed: 2 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
import time
2727
import concurrent.futures
2828
from typing import Callable
29-
from urllib.parse import urlparse
3029

3130
from collections import defaultdict
3231

@@ -110,38 +109,6 @@
110109
# ----
111110

112111

113-
def _normalize_git_repo_url(repo_url: str) -> str:
114-
"""Normalize git repository URL for matching by removing protocol/scheme.
115-
116-
This enables matching git repositories regardless of whether they use
117-
http, https, git, or other protocols. For example:
118-
- http://git.musl-libc.org/git/musl
119-
- https://git.musl-libc.org/git/musl
120-
- git://git.musl-libc.org/git/musl
121-
122-
Will all normalize to: git.musl-libc.org/git/musl
123-
124-
Args:
125-
repo_url: The git repository URL to normalize
126-
127-
Returns:
128-
The normalized URL without protocol/scheme
129-
"""
130-
if not repo_url:
131-
return repo_url
132-
133-
try:
134-
parsed = urlparse(repo_url)
135-
# Remove scheme and reconstruct without it
136-
# Keep netloc (hostname) and path
137-
normalized = parsed.netloc + parsed.path
138-
139-
# Remove trailing slash
140-
return normalized.rstrip('/')
141-
except Exception:
142-
return repo_url
143-
144-
145112
def ndb_context(func):
146113
"""Wrapper to create an NDB context."""
147114

@@ -1082,8 +1049,8 @@ def _is_version_affected(affected_packages,
10821049

10831050
# Normalize both URLs for comparison to handle protocol differences
10841051
# (http vs https vs git://, etc.)
1085-
normalized_package_name = _normalize_git_repo_url(package_name)
1086-
normalized_repo_url = _normalize_git_repo_url(repo_url)
1052+
normalized_package_name = osv.normalize_repo_package(package_name)
1053+
normalized_repo_url = osv.normalize_repo_package(repo_url)
10871054

10881055
if normalized_package_name != normalized_repo_url:
10891056
continue

gcp/api/server_new.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,12 @@ def query_package(context,
6161
if not package_name:
6262
return []
6363

64-
query = osv.AffectedVersions.query(osv.AffectedVersions.name == package_name)
64+
query = osv.AffectedVersions.query(
65+
osv.AffectedVersions.name.IN([
66+
package_name,
67+
# Also query the normalized name in case this is a GIT repo.
68+
osv.normalize_repo_package(package_name)
69+
]))
6570
if ecosystem:
6671
query = query.filter(osv.AffectedVersions.ecosystem == ecosystem)
6772
query = query.order(osv.AffectedVersions.vuln_id)
@@ -83,7 +88,7 @@ def query_package(context,
8388
affected: osv.AffectedVersions = it.next()
8489
if affected.vuln_id == last_matched_id:
8590
continue
86-
if not version or affected_affects(version, affected):
91+
if not version or affected_affects(package_name, version, affected):
8792
if include_details:
8893
bugs.append(get_vuln_async(affected.vuln_id))
8994
else:
@@ -94,8 +99,16 @@ def query_package(context,
9499
return bugs
95100

96101

97-
def affected_affects(version: str, affected: osv.AffectedVersions) -> bool:
102+
def affected_affects(name: str, version: str,
103+
affected: osv.AffectedVersions) -> bool:
98104
"""Check if a given version is affected by the AffectedVersions entry."""
105+
# Make sure the package name correctly matches this entity.
106+
if affected.ecosystem != 'GIT' and name != affected.name:
107+
return False
108+
if (affected.ecosystem == 'GIT' and
109+
osv.normalize_repo_package(name) != affected.name):
110+
return False
111+
99112
if len(affected.versions) > 0:
100113
return _match_versions(version, affected)
101114
if len(affected.events) > 0:

gcp/api/server_test.py

Lines changed: 1 addition & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
import unittest
1717

18-
from server import should_skip_bucket, _normalize_git_repo_url
18+
from server import should_skip_bucket
1919

2020

2121
class ServerTest(unittest.TestCase):
@@ -41,38 +41,6 @@ def test_should_skip_bucket(self):
4141
for path, expected in test_cases:
4242
self.assertEqual(expected, should_skip_bucket(path))
4343

44-
def test_normalize_git_repo_url(self):
45-
"""Test _normalize_git_repo_url function."""
46-
test_cases = [
47-
# protocol normalization
48-
('http://git.musl-libc.org/git/musl', 'git.musl-libc.org/git/musl'),
49-
('https://git.musl-libc.org/git/musl', 'git.musl-libc.org/git/musl'),
50-
('git://git.musl-libc.org/git/musl', 'git.musl-libc.org/git/musl'),
51-
52-
# github examples
53-
('http://github.com/user/repo', 'github.com/user/repo'),
54-
('https://github.com/user/repo', 'github.com/user/repo'),
55-
('git://github.com/user/repo', 'github.com/user/repo'),
56-
57-
# trailing slash
58-
('https://github.com/user/repo/', 'github.com/user/repo'),
59-
('http://git.example.com/path/', 'git.example.com/path'),
60-
61-
# .git suffix preserved
62-
('https://github.com/user/repo.git', 'github.com/user/repo.git'),
63-
('http://git.example.com/repo.git', 'git.example.com/repo.git'),
64-
65-
# edge cases
66-
('', ''),
67-
('invalid-url', 'invalid-url'),
68-
('http://', ''),
69-
('https://hostname', 'hostname'),
70-
]
71-
72-
for repo_url, expected in test_cases:
73-
with self.subTest(repo_url=repo_url):
74-
self.assertEqual(expected, _normalize_git_repo_url(repo_url))
75-
7644

7745
if __name__ == '__main__':
7846
unittest.main()

osv/models.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1212,7 +1212,7 @@ def affected_from_bug(entity: Bug) -> list[AffectedVersions]:
12121212
AffectedVersions(
12131213
vuln_id=entity.db_id,
12141214
ecosystem='GIT',
1215-
name=repo_url,
1215+
name=normalize_repo_package(repo_url),
12161216
versions=affected.versions,
12171217
))
12181218

@@ -1245,6 +1245,36 @@ def diff_affected_versions(
12451245
return added, removed
12461246

12471247

1248+
def normalize_repo_package(repo_url: str) -> str:
1249+
"""Normalize the repo_url for use with GIT AffectedVersions entities.
1250+
1251+
Removes the scheme/protocol and the .git extension, and trailing slashes.
1252+
1253+
For example:
1254+
- 'http://git.musl-libc.org/git/musl' (e.g. CVE-2017-15650)
1255+
and 'https://git.musl-libc.org/git/musl' (e.g. CVE-2025-26519)
1256+
both become 'git.musl-libc.org/git/musl'
1257+
- 'https://github.com/curl/curl.git' (e.g. CURL-CVE-2024-2004)
1258+
and 'https://github.com/curl/curl' (e.g. CVE-2025-5025)
1259+
both become 'github.com/curl/curl'
1260+
"""
1261+
if not repo_url:
1262+
return repo_url
1263+
1264+
try:
1265+
parsed = urlparse(repo_url)
1266+
# Remove scheme and reconstruct without it
1267+
# Keep netloc (hostname) and path
1268+
normalized = parsed.netloc + parsed.path
1269+
1270+
# Remove trailing slash
1271+
normalized = normalized.rstrip('/')
1272+
normalized = normalized.removesuffix('.git')
1273+
return normalized
1274+
except Exception:
1275+
return repo_url
1276+
1277+
12481278
# --- Indexer entities ---
12491279

12501280

osv/models_test.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ def test_bug_post_put(self):
174174
models.AffectedVersions(
175175
vuln_id=vuln_id,
176176
ecosystem='GIT',
177-
name='https://github.com/test/test',
177+
name='github.com/test/test',
178178
versions=['v1', 'v2']),
179179
models.AffectedVersions(
180180
vuln_id=vuln_id,
@@ -375,6 +375,38 @@ def test_oss_fuzz_private(self):
375375
blob = bucket.get_blob(os.path.join(gcs.VULN_PB_PATH, f'{vuln_id}.pb'))
376376
self.assertIsNone(blob)
377377

378+
def test_normalize_repo(self):
379+
"""Test normalize_repo_package function."""
380+
test_cases = [
381+
# protocol normalization
382+
('http://git.musl-libc.org/git/musl', 'git.musl-libc.org/git/musl'),
383+
('https://git.musl-libc.org/git/musl', 'git.musl-libc.org/git/musl'),
384+
('git://git.musl-libc.org/git/musl', 'git.musl-libc.org/git/musl'),
385+
386+
# github examples
387+
('http://github.com/user/repo', 'github.com/user/repo'),
388+
('https://github.com/user/repo', 'github.com/user/repo'),
389+
('git://github.com/user/repo', 'github.com/user/repo'),
390+
391+
# trailing slash
392+
('https://github.com/user/repo/', 'github.com/user/repo'),
393+
('http://git.example.com/path/', 'git.example.com/path'),
394+
395+
# .git suffix removed
396+
('https://github.com/user/repo.git', 'github.com/user/repo'),
397+
('http://git.example.com/repo.git', 'git.example.com/repo'),
398+
399+
# edge cases
400+
('', ''),
401+
('invalid-url', 'invalid-url'),
402+
('http://', ''),
403+
('https://hostname', 'hostname'),
404+
]
405+
406+
for repo_url, expected in test_cases:
407+
with self.subTest(repo_url=repo_url):
408+
self.assertEqual(expected, models.normalize_repo_package(repo_url))
409+
378410

379411
def setUpModule():
380412
"""Set up the test module."""

0 commit comments

Comments
 (0)