Skip to content
This repository was archived by the owner on Sep 30, 2024. It is now read-only.

Commit 2f1faff

Browse files
authored
Embeddings: add progress updates during indexing (#53202)
This adds incremental stats reporting to our embedding indexing jobs and displays it in the UI while the job is processing.
1 parent b67c4a7 commit 2f1faff

File tree

25 files changed

+1018
-225
lines changed

25 files changed

+1018
-225
lines changed

client/web/src/enterprise/site-admin/cody/RepoEmbeddingJobNode.tsx

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ export const RepoEmbeddingJobNode: FC<RepoEmbeddingJobNodeProps> = ({
3636
queuedAt,
3737
startedAt,
3838
failureMessage,
39+
stats,
3940
onCancel,
4041
}) => (
4142
<li className="list-group-item p-2">
@@ -60,6 +61,7 @@ export const RepoEmbeddingJobNode: FC<RepoEmbeddingJobNodeProps> = ({
6061
queuedAt={queuedAt}
6162
startedAt={startedAt}
6263
failureMessage={failureMessage}
64+
stats={stats}
6365
/>
6466
</div>
6567
</div>
@@ -84,14 +86,25 @@ export const RepoEmbeddingJobNode: FC<RepoEmbeddingJobNodeProps> = ({
8486
)
8587

8688
const RepoEmbeddingJobExecutionInfo: FC<
87-
Pick<RepoEmbeddingJobFields, 'state' | 'cancel' | 'finishedAt' | 'failureMessage' | 'queuedAt' | 'startedAt'>
88-
> = ({ state, cancel, finishedAt, queuedAt, startedAt, failureMessage }) => {
89+
Pick<
90+
RepoEmbeddingJobFields,
91+
'state' | 'cancel' | 'finishedAt' | 'failureMessage' | 'queuedAt' | 'startedAt' | 'stats'
92+
>
93+
> = ({ state, cancel, finishedAt, queuedAt, startedAt, failureMessage, stats }) => {
8994
const [isPopoverOpen, setIsPopoverOpen] = useState(false)
95+
const estimatedFinish = calculateEstimatedFinish(
96+
startedAt,
97+
stats.filesScheduled,
98+
stats.filesEmbedded,
99+
stats.filesSkipped
100+
)
101+
90102
return (
91103
<>
92104
{state === RepoEmbeddingJobState.COMPLETED && finishedAt && (
93105
<small>
94-
Completed <Timestamp date={finishedAt} />
106+
Completed embedding {stats.filesEmbedded} files ({stats.filesSkipped} skipped){' '}
107+
<Timestamp date={finishedAt} />
95108
</small>
96109
)}
97110
{state === RepoEmbeddingJobState.CANCELED && finishedAt && (
@@ -114,6 +127,11 @@ const RepoEmbeddingJobExecutionInfo: FC<
114127
<small>
115128
{cancel ? (
116129
'Cancelling ...'
130+
) : estimatedFinish ? (
131+
<>
132+
Expected to finish <Timestamp date={estimatedFinish} /> (
133+
{stats.filesSkipped + stats.filesEmbedded}/{stats.filesScheduled} files)
134+
</>
117135
) : (
118136
<>
119137
Started processing <Timestamp date={startedAt} />
@@ -140,6 +158,31 @@ const RepoEmbeddingJobExecutionInfo: FC<
140158
)
141159
}
142160

161+
/**
 * Estimates when an embedding job will finish by linearly extrapolating from
 * the share of scheduled files that have been handled (embedded or skipped)
 * since the job started.
 *
 * @param startedAt Timestamp string of when the job started processing, or null if it has not started.
 * @param filesScheduled Total number of files scheduled to be processed.
 * @param filesEmbedded Number of files embedded so far.
 * @param filesSkipped Number of files skipped so far.
 * @param now Current time in milliseconds since the epoch; defaults to `Date.now()`.
 * @returns The projected finish time, or null when no meaningful estimate can be made.
 */
function calculateEstimatedFinish(
    startedAt: string | null,
    filesScheduled: number,
    filesEmbedded: number,
    filesSkipped: number,
    now?: number
): Date | null {
    if (!startedAt) {
        return null
    }
    const startTime = Date.parse(startedAt)
    if (Number.isNaN(startTime)) {
        // An unparseable timestamp would otherwise propagate into an Invalid Date.
        return null
    }
    if (filesScheduled <= 0) {
        // There is a period between when the job starts processing and when
        // we know how many files need to be processed. In the case where
        // we do not have an update with the number of files scheduled,
        // we cannot calculate a meaningful ETA.
        return null
    }
    const filesProcessed = filesEmbedded + filesSkipped
    if (filesProcessed <= 0) {
        // No progress has been reported yet. Extrapolating would divide by zero
        // and yield an Invalid Date, which is truthy and would be rendered by
        // callers that only check for null.
        return null
    }
    const currentTime = now ?? Date.now()
    const proportionFinished = filesProcessed / filesScheduled
    const timeElapsed = currentTime - startTime
    const estimatedTotalTime = timeElapsed / proportionFinished
    return new Date(startTime + estimatedTotalTime)
}
185+
143186
function getRepoEmbeddingJobStateBadgeVariant(state: RepoEmbeddingJobState): BadgeVariantType {
144187
switch (state) {
145188
case RepoEmbeddingJobState.COMPLETED:

client/web/src/enterprise/site-admin/cody/backend.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,11 @@ const REPO_EMBEDDING_JOB_FRAGMENT = gql`
3333
oid
3434
abbreviatedOID
3535
}
36+
stats {
37+
filesScheduled
38+
filesEmbedded
39+
filesSkipped
40+
}
3641
}
3742
`
3843

@@ -64,6 +69,9 @@ export const useRepoEmbeddingJobsConnection = (
6469
const { repoEmbeddingJobs } = dataOrThrowErrors(result)
6570
return repoEmbeddingJobs
6671
},
72+
options: {
73+
pollInterval: 5000,
74+
},
6775
})
6876

6977
export const SCHEDULE_REPO_EMBEDDING_JOBS = gql`

cmd/frontend/graphqlbackend/embeddings.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,4 +81,11 @@ type RepoEmbeddingJobResolver interface {
8181
Cancel() bool
8282
Repo(ctx context.Context) (*RepositoryResolver, error)
8383
Revision(ctx context.Context) (*GitCommitResolver, error)
84+
Stats(context.Context) (RepoEmbeddingJobStatsResolver, error)
85+
}
86+
87+
type RepoEmbeddingJobStatsResolver interface {
88+
FilesEmbedded() int32
89+
FilesScheduled() int32
90+
FilesSkipped() int32
8491
}

cmd/frontend/graphqlbackend/embeddings.graphql

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,35 @@ type RepoEmbeddingJob implements Node {
217217
The revision at which the repo was embedded.
218218
"""
219219
revision: GitCommit
220+
221+
"""
222+
Statistics about the embeddings index job.
223+
This will be updated periodically while the embeddings job is processing.
224+
"""
225+
stats: RepoEmbeddingsStats!
226+
}
227+
228+
"""
229+
Statistics about an embeddings index job.
230+
This will be updated periodically while the embeddings job is processing.
231+
"""
232+
type RepoEmbeddingsStats {
233+
"""
234+
The number of files scheduled to be embedded.
235+
"""
236+
filesScheduled: Int!
237+
238+
"""
239+
The number of files we generated embeddings for.
240+
This will be updated periodically while the embeddings job is processing.
241+
"""
242+
filesEmbedded: Int!
243+
244+
"""
245+
The number of files skipped.
246+
This will be updated periodically while the embeddings job is processing.
247+
"""
248+
filesSkipped: Int!
220249
}
221250

222251
"""

enterprise/cmd/frontend/internal/embeddings/resolvers/repo_embedding_jobs.go

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,11 @@ func (s *repoEmbeddingJobsConnectionStore) ComputeNodes(ctx context.Context, arg
5757
}
5858
resolvers := make([]graphqlbackend.RepoEmbeddingJobResolver, 0, len(jobs))
5959
for _, job := range jobs {
60-
resolvers = append(resolvers, &repoEmbeddingJobResolver{db: s.db, gitserverClient: s.gitserverClient, job: job})
60+
resolvers = append(resolvers, &repoEmbeddingJobResolver{
61+
db: s.db,
62+
gitserverClient: s.gitserverClient,
63+
job: job,
64+
})
6165
}
6266
return resolvers, nil
6367
}
@@ -149,6 +153,15 @@ func (r *repoEmbeddingJobResolver) Cancel() bool {
149153
return r.job.Cancel
150154
}
151155

156+
func (r *repoEmbeddingJobResolver) Stats(ctx context.Context) (graphqlbackend.RepoEmbeddingJobStatsResolver, error) {
157+
store := repobg.NewRepoEmbeddingJobsStore(r.db)
158+
stats, err := store.GetRepoEmbeddingJobStats(ctx, r.job.ID)
159+
if err != nil {
160+
return nil, err
161+
}
162+
return &repoEmbeddingJobStatsResolver{stats}, nil
163+
}
164+
152165
func (r *repoEmbeddingJobResolver) compute(ctx context.Context) (*graphqlbackend.RepositoryResolver, error) {
153166
r.once.Do(func() {
154167
repo, err := r.db.Repos().Get(ctx, r.job.RepoID)
@@ -189,3 +202,26 @@ func unmarshalRepoEmbeddingJobID(id graphql.ID) (jobID int, err error) {
189202
err = relay.UnmarshalSpec(id, &jobID)
190203
return
191204
}
205+
206+
type repoEmbeddingJobStatsResolver struct {
207+
stats repobg.EmbedRepoStats
208+
}
209+
210+
func (r *repoEmbeddingJobStatsResolver) FilesScheduled() int32 {
211+
return int32(r.stats.CodeIndexStats.FilesScheduled + r.stats.TextIndexStats.FilesScheduled)
212+
}
213+
214+
func (r *repoEmbeddingJobStatsResolver) FilesEmbedded() int32 {
215+
return int32(r.stats.CodeIndexStats.FilesEmbedded + r.stats.TextIndexStats.FilesEmbedded)
216+
}
217+
218+
func (r *repoEmbeddingJobStatsResolver) FilesSkipped() int32 {
219+
skipped := 0
220+
for _, count := range r.stats.CodeIndexStats.FilesSkipped {
221+
skipped += count
222+
}
223+
for _, count := range r.stats.TextIndexStats.FilesSkipped {
224+
skipped += count
225+
}
226+
return int32(skipped)
227+
}

enterprise/cmd/worker/internal/embeddings/repo/handler.go

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ import (
99
codeintelContext "github.com/sourcegraph/sourcegraph/enterprise/internal/codeintel/context"
1010
edb "github.com/sourcegraph/sourcegraph/enterprise/internal/database"
1111
"github.com/sourcegraph/sourcegraph/enterprise/internal/embeddings"
12-
repoembeddingsbg "github.com/sourcegraph/sourcegraph/enterprise/internal/embeddings/background/repo"
12+
bgrepo "github.com/sourcegraph/sourcegraph/enterprise/internal/embeddings/background/repo"
1313
"github.com/sourcegraph/sourcegraph/enterprise/internal/embeddings/embed"
1414
"github.com/sourcegraph/sourcegraph/internal/actor"
1515
"github.com/sourcegraph/sourcegraph/internal/api"
@@ -27,10 +27,10 @@ type handler struct {
2727
uploadStore uploadstore.Store
2828
gitserverClient gitserver.Client
2929
contextService embed.ContextService
30-
repoEmbeddingJobsStore repoembeddingsbg.RepoEmbeddingJobsStore
30+
repoEmbeddingJobsStore bgrepo.RepoEmbeddingJobsStore
3131
}
3232

33-
var _ workerutil.Handler[*repoembeddingsbg.RepoEmbeddingJob] = &handler{}
33+
var _ workerutil.Handler[*bgrepo.RepoEmbeddingJob] = &handler{}
3434

3535
// The threshold to embed the entire file is slightly larger than the chunk threshold to
3636
// avoid splitting small files unnecessarily.
@@ -49,7 +49,7 @@ var splitOptions = codeintelContext.SplitOptions{
4949
ChunkEarlySplitTokensThreshold: embeddingChunkEarlySplitTokensThreshold,
5050
}
5151

52-
func (h *handler) Handle(ctx context.Context, logger log.Logger, record *repoembeddingsbg.RepoEmbeddingJob) error {
52+
func (h *handler) Handle(ctx context.Context, logger log.Logger, record *bgrepo.RepoEmbeddingJob) error {
5353
if !conf.EmbeddingsEnabled() {
5454
return errors.New("embeddings are not configured or disabled")
5555
}
@@ -99,6 +99,12 @@ func (h *handler) Handle(ctx context.Context, logger log.Logger, record *repoemb
9999
return err
100100
}
101101

102+
reportStats := func(stats *bgrepo.EmbedRepoStats) {
103+
if err := h.repoEmbeddingJobsStore.UpdateRepoEmbeddingJobStats(ctx, record.ID, stats); err != nil {
104+
logger.Error("failed to update embedding stats", log.Error(err))
105+
}
106+
}
107+
102108
repoEmbeddingIndex, toRemove, stats, err := embed.EmbedRepo(
103109
ctx,
104110
embeddingsClient,
@@ -107,11 +113,14 @@ func (h *handler) Handle(ctx context.Context, logger log.Logger, record *repoemb
107113
ranks,
108114
opts,
109115
logger,
116+
reportStats,
110117
)
111118
if err != nil {
112119
return err
113120
}
114121

122+
reportStats(stats) // final, complete report
123+
115124
logger.Info(
116125
"finished generating repo embeddings",
117126
log.String("repoName", string(repo.Name)),

enterprise/internal/embeddings/BUILD.bazel

Lines changed: 0 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

enterprise/internal/embeddings/background/repo/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)