Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 21 additions & 5 deletions toolchain/base/value_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#ifndef CARBON_TOOLCHAIN_BASE_VALUE_STORE_H_
#define CARBON_TOOLCHAIN_BASE_VALUE_STORE_H_

#include <concepts>
#include <memory>
#include <type_traits>
#include <utility>
Expand Down Expand Up @@ -37,10 +38,12 @@ template <class IdT>
class ValueStoreRange;

// Common calculation for ValueStore types.
template <typename IdT, typename ValueT = IdT::ValueType>
template <typename IdT, typename ValueT = IdT::ValueType,
typename KeyT = ValueT>
class ValueStoreTypes {
public:
using ValueType = std::decay_t<ValueT>;
using KeyType = std::decay_t<KeyT>;

// Typically we want to use `ValueType&` and `const ValueType& to avoid
// copies, but when the value type is a `StringRef`, we assume external
Expand All @@ -53,6 +56,14 @@ class ValueStoreTypes {
llvm::StringRef, const ValueType&>;
};

// If `IdT` provides a distinct `IdT::KeyType`, default to that for the key
// type.
template <typename IdT>
requires(!std::same_as<typename IdT::ValueType, typename IdT::KeyType>)
class ValueStoreTypes<IdT>
: public ValueStoreTypes<IdT, typename IdT::ValueType,
typename IdT::KeyType> {};

// A simple wrapper for accumulating values, providing IDs to later retrieve the
// value. This does not do deduplication.
//
Expand Down Expand Up @@ -221,11 +232,16 @@ class ValueStoreRange {
// A wrapper for accumulating immutable values with deduplication, providing IDs
// to later retrieve the value.
//
// IdT::ValueType must represent the type being indexed.
// `IdT::ValueType` must represent the type being indexed.
//
// `IdT::KeyType` can optionally be present, and if so is used for the argument
// to `Lookup`. It must be valid to use both `KeyType` and `ValueType` as lookup
// types in the underlying `Set`.
template <typename IdT>
class CanonicalValueStore {
public:
using ValueType = ValueStoreTypes<IdT>::ValueType;
using KeyType = ValueStoreTypes<IdT>::KeyType;
using RefType = ValueStoreTypes<IdT>::RefType;
using ConstRefType = ValueStoreTypes<IdT>::ConstRefType;

Expand All @@ -237,7 +253,7 @@ class CanonicalValueStore {

// Looks up the canonical ID for a value, or returns `None` if not in the
// store.
auto Lookup(ValueType value) const -> IdT;
auto Lookup(KeyType key) const -> IdT;

// Reserves space.
auto Reserve(size_t size) -> void;
Expand Down Expand Up @@ -290,8 +306,8 @@ auto CanonicalValueStore<IdT>::Add(ValueType value) -> IdT {
}

template <typename IdT>
auto CanonicalValueStore<IdT>::Lookup(ValueType value) const -> IdT {
if (auto result = set_.Lookup(value, KeyContext(&values_))) {
auto CanonicalValueStore<IdT>::Lookup(KeyType key) const -> IdT {
if (auto result = set_.Lookup(key, KeyContext(&values_))) {
return result.key();
}
return IdT::None;
Expand Down
11 changes: 4 additions & 7 deletions toolchain/check/import_cpp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -350,9 +350,8 @@ static auto AsCarbonNamespace(Context& context,
auto& clang_decls = context.sem_ir().clang_decls();

// Check if the decl context is already mapped to a Carbon namespace.
if (auto context_clang_decl_id = clang_decls.Lookup(
{.decl = clang::dyn_cast<clang::Decl>(decl_context),
.inst_id = SemIR::InstId::None});
if (auto context_clang_decl_id =
clang_decls.Lookup(clang::dyn_cast<clang::Decl>(decl_context));
context_clang_decl_id.has_value()) {
return clang_decls.Get(context_clang_decl_id).inst_id;
}
Expand All @@ -365,8 +364,7 @@ static auto AsCarbonNamespace(Context& context,
decl_contexts.push_back(decl_context);
decl_context = decl_context->getParent();
parent_decl_id =
clang_decls.Lookup({.decl = clang::dyn_cast<clang::Decl>(decl_context),
.inst_id = SemIR::InstId::None});
clang_decls.Lookup(clang::dyn_cast<clang::Decl>(decl_context));
} while (!parent_decl_id.has_value());

// We know the parent of the last decl context is mapped, map the rest.
Expand Down Expand Up @@ -532,8 +530,7 @@ static auto MapRecordType(Context& context, SemIR::LocId loc_id,
if (record_decl && !record_decl->isUnion()) {
auto& clang_decls = context.sem_ir().clang_decls();
SemIR::InstId record_inst_id = SemIR::InstId::None;
if (auto record_clang_decl_id = clang_decls.Lookup(
{.decl = record_decl, .inst_id = SemIR::InstId::None});
if (auto record_clang_decl_id = clang_decls.Lookup(record_decl);
record_clang_decl_id.has_value()) {
record_inst_id = clang_decls.Get(record_clang_decl_id).inst_id;
} else {
Expand Down
34 changes: 26 additions & 8 deletions toolchain/sem_ir/clang_decl.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,17 +20,22 @@ class Decl;
namespace Carbon::SemIR {

// A Clang declaration mapped to a Carbon instruction.
// Using custom hashing since the declaration is keyed by the `decl` member for
// lookup.
// TODO: Avoid custom hashing by either having the data structure support keying
// or create a dedicated mapping. See
// https://discord.com/channels/655572317891461132/768530752592805919/1384999468293947537
//
// Note that Clang's AST uses address-identity for nodes, which means the
// pointer is the canonical way to represent a specific AST node and is expected
// to be sufficient for comparison, hashing, etc.
struct ClangDecl : public Printable<ClangDecl> {
auto Print(llvm::raw_ostream& out) const -> void;

friend auto CarbonHashtableEq(const ClangDecl& lhs, const ClangDecl& rhs)
-> bool {
return HashtableEq(lhs.decl, rhs.decl);
// Equality comparison uses the address-identity property of the Clang AST and
// just compares the `decl` pointers. We canonicalize on these to avoid ever
// having different instructions for the same declaration.
auto operator==(const ClangDecl& rhs) const -> bool {
return decl == rhs.decl;
}
// Support direct comparison with the Clang AST node pointer.
auto operator==(const clang::Decl* rhs_decl) const -> bool {
return decl == rhs_decl;
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC the concern raised by @geoffromer correctly, we should make ClangDecl default comparison include all the fields. I think we should be able to customize the specific store to compare only the decl field for the storage needs instead of changing ClangDecl itself.
https://discord.com/channels/655572317891461132/768530752592805919/1384999468293947537
I think that if we can provide the information of not just what is the key of the storage but also how to extract the key from the value.
This is what I've tried to do in a clunky way in #5725, but I guess I missed changing the API appropriately.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried to leave comments to explain why I don't think this is needed, also discussed some in the Discord...

The instruction ID isn't part of the identity of a declaration, it is just associated data. It's value is completely determined by the declaration pointer. Comparing them both should be redundant.

Is there an added comment that would make this more clear than the current ones?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The instruction ID isn't part of the identity of a declaration, it is just associated data. It's value is completely determined by the declaration pointer. Comparing them both should be redundant.

But this isn't an invariant of the type itself, it's an invariant of how the type is used. Relying so deeply on that invariant makes ClangDecl hard to understand in isolation -- it has to be understood in terms of its use as the value type of clang_decls (and the fact that this is the only place ClangDecl is used).

Ideally I'd prefer to decouple this type from that invariant, e.g. by teaching ValueStore how to extract decl and use it (rather than the whole ClangDecl) as a lookup and canonicalization key. However, that's probably not worth the time investment right now, and might not be worth the additional complexity in ValueStore at all.

Failing that, I think we should more explicitly document the coupling with clang_decls, and I've added a suggestion to that end.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But this isn't an invariant of the type itself, it's an invariant of how the type is used. Relying so deeply on that invariant makes ClangDecl hard to understand in isolation -- it has to be understood in terms of its use as the value type of clang_decls (and the fact that this is the only place ClangDecl is used).

FWIW, I think that is true regardless of the inclusion of inst_id -- I don't think this struct is reasonable to understand in isolation, I think it is fundamentally an implementation detail of the value store and how we're putting clang::Decl*s into a value store.

[...] by teaching ValueStore how to extract decl and use it (rather than the whole ClangDecl) as a lookup and canonicalization key.

The CarbonHashValue and operator== definitions, IMO, are how you teach ValueStore what to use for canonicalization.

And what this PR does is extend that to also support teaching ValueStore about this for lookup. I feel like after this PR, we have done what you're suggesting here.

But all of that is in the context of this being coupled to the value store, not some independently useful type.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I accepted your comment, but to your point this is really about the core of the ClangDecl class, so lifted it up to the class comment and reworded it a bit to try to make this more clear and explicit. PTAL.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm merging to unblock other work, but if you'd like any further changes to the comments here, happy to make them in follow-up PRs.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment changes look good to me. My only hesitation is that as I understand it, the hash/equality customization will break if ClangDecls associated with different CanonicalValueStores are intermingled, but the comment doesn't call that out. However, in context that situation may be too unlikely to be worth commenting on.


// Hashing for ClangDecl. See common/hashing.h.
Expand All @@ -45,15 +50,28 @@ struct ClangDecl : public Printable<ClangDecl> {
clang::Decl* decl = nullptr;

// The instruction the Clang declaration is mapped to.
//
// This is stored along side the `decl` pointer to avoid having to lookup both
// the pointer and the instruction ID in two separate areas of storage.
InstId inst_id;
};

// The ID of a `ClangDecl`.
//
// These IDs are importantly distinct from the `inst_id` associated with each
// declaration. These form a dense range of IDs that is used to reference the
// AST node pointers without storing those pointers directly into SemIR and
// needing space to hold a full pointer. We can't avoid having these IDs without
// embedding pointers directly into the storage of SemIR as part of an
// instruction.
struct ClangDeclId : public IdBase<ClangDeclId> {
static constexpr llvm::StringLiteral Label = "clang_decl_id";

using ValueType = ClangDecl;

// Use the AST node pointer directly when doing `Lookup` to find an ID.
using KeyType = clang::Decl*;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think @jonmeow raised a concern that we should change the ID as it could be used in multiple value stores, but perhaps just defining the key type is less of an issue.
https://discord.com/channels/655572317891461132/768530752592805919/1387492664458612938

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Happy to hear Jon's thoughts here -- the ID type already dictates the value type, and even the reference type, so it seemed reasonable to also dictate the key type... Not sure what the looser coupling would look like?

Copy link
Contributor

@jonmeow jonmeow Jun 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bricknerb You were putting value-specific functions here (inside the ID), I think this approach is different.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note in part, I expect KeyType here is something we can deal with by template parameters overriding it. When the hash function of the value is specifically associated with the ID, that starts making it harder to replace -- I tend to think of switching types through template parameters as easier that switching functions discovered through type parameters; adding/swapping a type parameter is easy, but discovering a function through one type when the function is actually for a different type is a bit more of a design issue.


using IdBase::IdBase;
};

Expand Down
Loading