Skip to content

Commit 9e8e725

Browse files
committed
Add crates index manager abstraction
The full crates.io index corpus is too large, and growing too fast, to realistically assume we can have access to all of it at once for inference purposes. This new component lets us keep a fixed-size cache of snapshots on disk, rotating out the least-used snapshots when other snapshots are requested. We unconditionally keep the current (i.e. non-archived) index cloned, given that it is the newest and the most likely to be requested during inference. The interface also supports loading index portions from an existing filesystem, which means it can also be used transiently to mediate access to these resources.

stacked-commit: true
1 parent d634056 commit 9e8e725

File tree

5 files changed

+1838
-173
lines changed

5 files changed

+1838
-173
lines changed
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
// Copyright 2025 Google LLC
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package index
5+
6+
import (
7+
"context"
8+
"regexp"
9+
"sort"
10+
11+
"github.com/go-git/go-billy/v5"
12+
"github.com/go-git/go-git/v5"
13+
"github.com/go-git/go-git/v5/config"
14+
"github.com/go-git/go-git/v5/plumbing"
15+
"github.com/go-git/go-git/v5/plumbing/cache"
16+
"github.com/go-git/go-git/v5/storage/filesystem"
17+
"github.com/pkg/errors"
18+
)
19+
20+
// Remote repositories backing the crates.io index.
//
// NOTE(review): these are declared as variables rather than constants,
// presumably so tests can point them at a local repository; confirm before
// converting them to const.
var (
	// currentIndexURL is the live crates.io index repository, cloned by
	// CurrentIndexFetcher.
	currentIndexURL = "https://github.com/rust-lang/crates.io-index.git"
	// archiveIndexURL is the archive repository whose snapshot-YYYY-MM-DD
	// branches are listed by ListAvailableSnapshots and cloned by
	// SnapshotIndexFetcher.
	archiveIndexURL = "https://github.com/rust-lang/crates.io-index-archive.git"
)
24+
25+
// snapshotBranchRegex matches a full branch reference name of the form
// refs/heads/snapshot-YYYY-MM-DD. Capture group 1 holds the date portion.
var snapshotBranchRegex = regexp.MustCompile(
	`^refs/heads/snapshot-(\d{4}-\d{2}-\d{2})$`,
)
26+
27+
// ListAvailableSnapshots queries the archive repository for available snapshots
28+
// Snapshots are returned as their associated RFC3339 date e.g. 2025-06-14.
29+
func ListAvailableSnapshots(ctx context.Context) ([]string, error) {
30+
// Create a remote reference to list branches
31+
rem := git.NewRemote(nil, &config.RemoteConfig{URLs: []string{archiveIndexURL}})
32+
// List the references
33+
refs, err := rem.ListContext(ctx, &git.ListOptions{})
34+
if err != nil {
35+
return nil, errors.Wrap(err, "failed to list remote refs")
36+
}
37+
var snapshots []string
38+
for _, ref := range refs {
39+
if matches := snapshotBranchRegex.FindStringSubmatch(ref.Name().String()); matches != nil {
40+
snapshots = append(snapshots, matches[1])
41+
}
42+
}
43+
sort.Strings(snapshots)
44+
return snapshots, nil
45+
}
46+
47+
// Fetcher defines how to fetch and update a repository index
48+
type Fetcher interface {
49+
// Fetch clones the repository into the given filesystem
50+
Fetch(ctx context.Context, fs billy.Filesystem) error
51+
// Update updates an existing repository in the filesystem
52+
Update(ctx context.Context, fs billy.Filesystem) error
53+
}
54+
55+
// CurrentIndexFetcher is the Fetcher for the current (non-archived) crates.io
// index. It carries no state, so its zero value is ready to use.
type CurrentIndexFetcher struct{}
57+
58+
func (f *CurrentIndexFetcher) Fetch(ctx context.Context, fs billy.Filesystem) error {
59+
storer := filesystem.NewStorage(fs, cache.NewObjectLRUDefault())
60+
_, err := git.CloneContext(ctx, storer, nil, &git.CloneOptions{
61+
URL: currentIndexURL,
62+
ReferenceName: plumbing.Master,
63+
SingleBranch: true,
64+
NoCheckout: true,
65+
})
66+
if err != nil {
67+
return errors.Wrap(err, "failed to clone current index")
68+
}
69+
// Nice-to-have: Set HEAD to track the remote since it will remain up-to-date on Update.
70+
remoteMain := plumbing.NewRemoteReferenceName(git.DefaultRemoteName, "master")
71+
err = storer.SetReference(plumbing.NewSymbolicReference(plumbing.HEAD, remoteMain))
72+
if err != nil {
73+
return errors.Wrap(err, "failed to configure HEAD")
74+
}
75+
return nil
76+
}
77+
78+
func (f *CurrentIndexFetcher) Update(ctx context.Context, fs billy.Filesystem) error {
79+
storer := filesystem.NewStorage(fs, cache.NewObjectLRUDefault())
80+
repo, err := git.Open(storer, nil)
81+
if err != nil {
82+
return errors.Wrap(err, "failed to open repository")
83+
}
84+
err = repo.FetchContext(ctx, &git.FetchOptions{Force: true})
85+
if err == git.NoErrAlreadyUpToDate {
86+
return nil
87+
} else if err != nil {
88+
return errors.Wrap(err, "failed to fetch updates")
89+
}
90+
return nil
91+
}
92+
93+
// SnapshotIndexFetcher is the Fetcher for a single archived snapshot of the
// crates.io index.
type SnapshotIndexFetcher struct {
	// Date identifies the snapshot: Fetch clones the archive branch named
	// "snapshot-" + Date. It is expected to be in the YYYY-MM-DD form that
	// ListAvailableSnapshots returns; the value is not validated here.
	Date string
}
97+
98+
func (f *SnapshotIndexFetcher) Fetch(ctx context.Context, fs billy.Filesystem) error {
99+
storer := filesystem.NewStorage(fs, cache.NewObjectLRUDefault())
100+
_, err := git.CloneContext(ctx, storer, nil, &git.CloneOptions{
101+
URL: archiveIndexURL,
102+
ReferenceName: plumbing.NewBranchReferenceName("snapshot-" + f.Date),
103+
SingleBranch: true,
104+
NoCheckout: true,
105+
})
106+
if err != nil {
107+
return errors.Wrapf(err, "failed to clone snapshot %s", f.Date)
108+
}
109+
return nil
110+
}
111+
112+
func (f *SnapshotIndexFetcher) Update(ctx context.Context, fs billy.Filesystem) error {
113+
// Snapshots are immutable
114+
return nil
115+
}

0 commit comments

Comments
 (0)