Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions pkg/crosscluster/logical/logical_replication_job.go
Original file line number Diff line number Diff line change
Expand Up @@ -456,9 +456,11 @@ type logicalReplicationPlanner struct {
}

type logicalReplicationPlanInfo struct {
sourceSpans []roachpb.Span
partitionPgUrls []string
destTableBySrcID map[descpb.ID]dstTableMetadata
sourceSpans []roachpb.Span
partitionPgUrls []string
destTableBySrcID map[descpb.ID]dstTableMetadata
// Number of processors writing data on the destination cluster (offline or
// otherwise).
writeProcessorCount int
}

Expand Down Expand Up @@ -738,6 +740,7 @@ func (p *logicalReplicationPlanner) planOfflineInitialScan(
SQLInstanceID: instanceID,
Core: execinfrapb.ProcessorCoreUnion{LogicalReplicationOfflineScan: &spec},
})
info.writeProcessorCount++
}
}

Expand Down
59 changes: 47 additions & 12 deletions pkg/crosscluster/logical/offline_initial_scan_processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (

"github.com/cockroachdb/cockroach/pkg/backup"
"github.com/cockroachdb/cockroach/pkg/crosscluster"
"github.com/cockroachdb/cockroach/pkg/crosscluster/replicationutils"
"github.com/cockroachdb/cockroach/pkg/crosscluster/streamclient"
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
"github.com/cockroachdb/cockroach/pkg/kv/bulk"
Expand Down Expand Up @@ -65,6 +66,8 @@ type offlineInitialScanProcessor struct {

checkpointCh chan offlineCheckpoint

rangeStatsCh chan *streampb.StreamEvent_RangeStats

rekey *backup.KeyRewriter

batcher *bulk.SSTBatcher
Expand Down Expand Up @@ -104,6 +107,7 @@ func newNewOfflineInitialScanProcessor(
processorID: processorID,
stopCh: make(chan struct{}),
checkpointCh: make(chan offlineCheckpoint),
rangeStatsCh: make(chan *streampb.StreamEvent_RangeStats),
errCh: make(chan error, 1),
rekey: rekeyer,
lastKeyAdded: roachpb.Key{},
Expand Down Expand Up @@ -220,6 +224,7 @@ func (o *offlineInitialScanProcessor) Start(ctx context.Context) {
})
o.workerGroup.GoCtx(func(ctx context.Context) error {
defer close(o.checkpointCh)
defer close(o.rangeStatsCh)
pprof.Do(ctx, pprof.Labels("proc", fmt.Sprintf("%d", o.ProcessorID)), func(ctx context.Context) {
for event := range o.subscription.Events() {
if err := o.handleEvent(ctx, event); err != nil {
Expand All @@ -245,16 +250,8 @@ func (o *offlineInitialScanProcessor) Next() (rowenc.EncDatumRow, *execinfrapb.P
case checkpoint, ok := <-o.checkpointCh:
switch {
case !ok:
select {
case err := <-o.errCh:
o.MoveToDrainingAndLogError(err)
return nil, o.DrainHelper()
case <-time.After(10 * time.Second):
logcrash.ReportOrPanic(o.Ctx(), &o.FlowCtx.Cfg.Settings.SV,
"event channel closed but no error found on err channel after 10 seconds")
o.MoveToDrainingAndLogError(nil /* error */)
return nil, o.DrainHelper()
}
o.MoveToDrainingAndLogError(o.waitForErr())
return nil, o.DrainHelper()
case checkpoint.afterInitialScanCompletion:
// The previous checkpoint completed the initial scan and was already
// ingested by the coordinator, so we can gracefully shut down the
Expand All @@ -273,6 +270,18 @@ func (o *offlineInitialScanProcessor) Next() (rowenc.EncDatumRow, *execinfrapb.P
}
return row, nil
}
case stats, ok := <-o.rangeStatsCh:
if !ok {
o.MoveToDrainingAndLogError(o.waitForErr())
return nil, o.DrainHelper()
}

meta, err := replicationutils.StreamRangeStatsToProgressMeta(o.FlowCtx, o.ProcessorID, stats)
if err != nil {
o.MoveToDrainingAndLogError(err)
return nil, o.DrainHelper()
}
return nil, meta
case err := <-o.errCh:
o.MoveToDrainingAndLogError(err)
return nil, o.DrainHelper()
Expand Down Expand Up @@ -345,7 +354,7 @@ func (o *offlineInitialScanProcessor) handleEvent(
return err
}
case crosscluster.CheckpointEvent:
if err := o.checkpoint(ctx, event.GetCheckpoint().ResolvedSpans); err != nil {
if err := o.checkpoint(ctx, event.GetCheckpoint()); err != nil {
return err
}
case crosscluster.SSTableEvent, crosscluster.DeleteRangeEvent:
Expand All @@ -358,9 +367,26 @@ func (o *offlineInitialScanProcessor) handleEvent(
return nil
}

// waitForErr waits for an error to be sent on the error channel and returns the
// error if one is found within the timeout.
func (o *offlineInitialScanProcessor) waitForErr() error {
select {
case err := <-o.errCh:
return err
case <-time.After(10 * time.Second):
logcrash.ReportOrPanic(o.Ctx(), &o.FlowCtx.Cfg.Settings.SV,
"event channel closed but no error found on err channel after 10 seconds")
return nil
}
}

func (o *offlineInitialScanProcessor) checkpoint(
ctx context.Context, resolvedSpans []jobspb.ResolvedSpan,
ctx context.Context, checkpoint *streampb.StreamEvent_StreamCheckpoint,
) error {
if checkpoint == nil {
return errors.New("nil checkpoint event")
}
resolvedSpans := checkpoint.ResolvedSpans
if resolvedSpans == nil {
return errors.New("checkpoint event expected to have resolved spans")
}
Expand Down Expand Up @@ -406,6 +432,15 @@ func (o *offlineInitialScanProcessor) checkpoint(
// shutdown the processor.
o.initialScanCompleted = true
}

if checkpoint.RangeStats != nil {
select {
case o.rangeStatsCh <- checkpoint.RangeStats:
case <-o.stopCh:
case <-ctx.Done():
return ctx.Err()
}
}
return nil
}

Expand Down