Skip to content

Commit 408f0b2

Browse files
committed
Add crates index manager abstraction
The full crates.io index corpus is too large and growing too fast to realistically assume we can have access to it all at once for inference purposes. This new component enables us to keep a fixed-size cache of snapshots on disk while rotating the least used snapshots out when other snapshots are requested. We unconditionally keep the current (i.e. non-archived) index cloned given it is the newest and most likely to be requested during inference. The interface also supports loading index portions from an existing filesystem which means it can also be used transiently to mediate access to these resources.
1 parent 0f83101 commit 408f0b2

File tree

6 files changed

+1944
-172
lines changed

6 files changed

+1944
-172
lines changed

internal/safememfs/safememfs.go

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
// Copyright 2025 Google LLC
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package safememfs
5+
6+
import (
7+
"os"
8+
"sync"
9+
10+
"github.com/go-git/go-billy/v5"
11+
"github.com/go-git/go-billy/v5/memfs"
12+
)
13+
14+
// SafeMemory is a thread-safe wrapper for any billy.Filesystem.
15+
type SafeMemory struct {
16+
fs billy.Filesystem
17+
mu *sync.RWMutex
18+
}
19+
20+
// New creates a new thread-safe in-memory filesystem.
21+
func New() *SafeMemory {
22+
return &SafeMemory{
23+
fs: memfs.New(),
24+
mu: &sync.RWMutex{},
25+
}
26+
}
27+
28+
func (s *SafeMemory) Chroot(path string) (billy.Filesystem, error) {
29+
s.mu.RLock()
30+
defer s.mu.RUnlock()
31+
newFs, err := s.fs.Chroot(path)
32+
if err != nil {
33+
return nil, err
34+
}
35+
return &SafeMemory{
36+
fs: newFs,
37+
mu: s.mu, // NOTE: same mutex
38+
}, nil
39+
}
40+
41+
func (s *SafeMemory) Root() string {
42+
return "/"
43+
}
44+
45+
// --- Write Operations (use exclusive Lock) ---
46+
47+
func (s *SafeMemory) Create(filename string) (billy.File, error) {
48+
s.mu.Lock()
49+
defer s.mu.Unlock()
50+
return s.fs.Create(filename)
51+
}
52+
53+
func (s *SafeMemory) OpenFile(filename string, flag int, perm os.FileMode) (billy.File, error) {
54+
s.mu.Lock()
55+
defer s.mu.Unlock()
56+
return s.fs.OpenFile(filename, flag, perm)
57+
}
58+
59+
func (s *SafeMemory) MkdirAll(path string, perm os.FileMode) error {
60+
s.mu.Lock()
61+
defer s.mu.Unlock()
62+
return s.fs.MkdirAll(path, perm)
63+
}
64+
65+
func (s *SafeMemory) Rename(from, to string) error {
66+
s.mu.Lock()
67+
defer s.mu.Unlock()
68+
return s.fs.Rename(from, to)
69+
}
70+
71+
func (s *SafeMemory) Remove(filename string) error {
72+
s.mu.Lock()
73+
defer s.mu.Unlock()
74+
return s.fs.Remove(filename)
75+
}
76+
77+
func (s *SafeMemory) Symlink(target, link string) error {
78+
s.mu.Lock()
79+
defer s.mu.Unlock()
80+
return s.fs.Symlink(target, link)
81+
}
82+
83+
func (s *SafeMemory) TempFile(dir, prefix string) (billy.File, error) {
84+
s.mu.Lock()
85+
defer s.mu.Unlock()
86+
return s.fs.TempFile(dir, prefix)
87+
}
88+
89+
// --- Read Operations (use shared RLock) ---
90+
91+
func (s *SafeMemory) Open(filename string) (billy.File, error) {
92+
s.mu.RLock()
93+
defer s.mu.RUnlock()
94+
return s.fs.OpenFile(filename, os.O_RDONLY, 0)
95+
}
96+
97+
func (s *SafeMemory) Stat(filename string) (os.FileInfo, error) {
98+
s.mu.RLock()
99+
defer s.mu.RUnlock()
100+
return s.fs.Stat(filename)
101+
}
102+
103+
func (s *SafeMemory) Lstat(filename string) (os.FileInfo, error) {
104+
s.mu.RLock()
105+
defer s.mu.RUnlock()
106+
return s.fs.Lstat(filename)
107+
}
108+
109+
func (s *SafeMemory) ReadDir(path string) ([]os.FileInfo, error) {
110+
s.mu.RLock()
111+
defer s.mu.RUnlock()
112+
return s.fs.ReadDir(path)
113+
}
114+
115+
func (s *SafeMemory) Readlink(link string) (string, error) {
116+
s.mu.RLock()
117+
defer s.mu.RUnlock()
118+
return s.fs.Readlink(link)
119+
}
120+
121+
func (s *SafeMemory) Join(elem ...string) string {
122+
// No lock needed as this is a pure function
123+
return s.fs.Join(elem...)
124+
}
125+
126+
// Capabilities forwards the call to the underlying filesystem.
127+
// No lock needed as it doesn't modify state.
128+
func (s *SafeMemory) Capabilities() billy.Capability {
129+
if capable, ok := s.fs.(billy.Capable); ok {
130+
return capable.Capabilities()
131+
}
132+
// Default capabilities if the underlying fs doesn't implement billy.Capable
133+
return 0
134+
}
135+
136+
// Ensure the wrapper implements the full interface.
137+
var _ billy.Filesystem = (*SafeMemory)(nil)
138+
var _ billy.Capable = (*SafeMemory)(nil)
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
// Copyright 2025 Google LLC
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package index
5+
6+
import (
7+
"context"
8+
"regexp"
9+
"sort"
10+
11+
"github.com/go-git/go-billy/v5"
12+
"github.com/go-git/go-git/v5"
13+
"github.com/go-git/go-git/v5/config"
14+
"github.com/go-git/go-git/v5/plumbing"
15+
"github.com/go-git/go-git/v5/plumbing/cache"
16+
"github.com/go-git/go-git/v5/storage/filesystem"
17+
"github.com/pkg/errors"
18+
)
19+
20+
var (
21+
currentIndexURL = "https://github.com/rust-lang/crates.io-index.git"
22+
archiveIndexURL = "https://github.com/rust-lang/crates.io-index-archive.git"
23+
)
24+
25+
var snapshotBranchRegex = regexp.MustCompile(`^refs/heads/snapshot-(\d{4}-\d{2}-\d{2})$`)
26+
27+
// ListAvailableSnapshots queries the archive repository for available snapshots
28+
// Snapshots are returned as their associated RFC3339 date e.g. 2025-06-14.
29+
func ListAvailableSnapshots(ctx context.Context) ([]string, error) {
30+
// Create a remote reference to list branches
31+
rem := git.NewRemote(nil, &config.RemoteConfig{URLs: []string{archiveIndexURL}})
32+
// List the references
33+
refs, err := rem.ListContext(ctx, &git.ListOptions{})
34+
if err != nil {
35+
return nil, errors.Wrap(err, "failed to list remote refs")
36+
}
37+
var snapshots []string
38+
for _, ref := range refs {
39+
if matches := snapshotBranchRegex.FindStringSubmatch(ref.Name().String()); matches != nil {
40+
snapshots = append(snapshots, matches[1])
41+
}
42+
}
43+
sort.Strings(snapshots)
44+
return snapshots, nil
45+
}
46+
47+
// Fetcher defines how to fetch and update a repository index
48+
type Fetcher interface {
49+
// Fetch clones the repository into the given filesystem
50+
Fetch(ctx context.Context, fs billy.Filesystem) error
51+
// Update updates an existing repository in the filesystem
52+
Update(ctx context.Context, fs billy.Filesystem) error
53+
}
54+
55+
// CurrentIndexFetcher fetches the current crates.io index
56+
type CurrentIndexFetcher struct{}
57+
58+
func (f *CurrentIndexFetcher) Fetch(ctx context.Context, fs billy.Filesystem) error {
59+
storer := filesystem.NewStorage(fs, cache.NewObjectLRUDefault())
60+
_, err := git.CloneContext(ctx, storer, nil, &git.CloneOptions{
61+
URL: currentIndexURL,
62+
ReferenceName: plumbing.Master,
63+
SingleBranch: true,
64+
NoCheckout: true,
65+
})
66+
if err != nil {
67+
return errors.Wrap(err, "failed to clone current index")
68+
}
69+
// Nice-to-have: Set HEAD to track the remote since it will remain up-to-date on Update.
70+
remoteMain := plumbing.NewRemoteReferenceName(git.DefaultRemoteName, "master")
71+
err = storer.SetReference(plumbing.NewSymbolicReference(plumbing.HEAD, remoteMain))
72+
if err != nil {
73+
return errors.Wrap(err, "failed to configure HEAD")
74+
}
75+
return nil
76+
}
77+
78+
func (f *CurrentIndexFetcher) Update(ctx context.Context, fs billy.Filesystem) error {
79+
storer := filesystem.NewStorage(fs, cache.NewObjectLRUDefault())
80+
repo, err := git.Open(storer, nil)
81+
if err != nil {
82+
return errors.Wrap(err, "failed to open repository")
83+
}
84+
err = repo.FetchContext(ctx, &git.FetchOptions{Force: true})
85+
if err == git.NoErrAlreadyUpToDate {
86+
return nil
87+
} else if err != nil {
88+
return errors.Wrap(err, "failed to fetch updates")
89+
}
90+
return nil
91+
}
92+
93+
// SnapshotIndexFetcher fetches a specific snapshot branch
94+
type SnapshotIndexFetcher struct {
95+
Date string
96+
}
97+
98+
func (f *SnapshotIndexFetcher) Fetch(ctx context.Context, fs billy.Filesystem) error {
99+
storer := filesystem.NewStorage(fs, cache.NewObjectLRUDefault())
100+
_, err := git.CloneContext(ctx, storer, nil, &git.CloneOptions{
101+
URL: archiveIndexURL,
102+
ReferenceName: plumbing.NewBranchReferenceName("snapshot-" + f.Date),
103+
SingleBranch: true,
104+
NoCheckout: true,
105+
})
106+
if err != nil {
107+
return errors.Wrapf(err, "failed to clone snapshot %s", f.Date)
108+
}
109+
return nil
110+
}
111+
112+
func (f *SnapshotIndexFetcher) Update(ctx context.Context, fs billy.Filesystem) error {
113+
// Snapshots are immutable
114+
return nil
115+
}

0 commit comments

Comments
 (0)