From b5805d55632f32f7e09735ca51698db9fdf6388d Mon Sep 17 00:00:00 2001 From: Juan Antonio Osorio Date: Sat, 30 May 2026 12:25:33 +0300 Subject: [PATCH] fix(github): create domain accounts for non-committer authors (#8886) ConvertAccounts sourced the domain `accounts` table FROM _tool_github_accounts, which is only populated for users we collected full profiles for (effectively, committers). Issue and PR authors who never committed were written into _tool_github_repo_accounts but never converted, so issues.creator_id and pull_requests.author_id pointed at accounts rows that didn't exist. Source ConvertAccounts FROM _tool_github_repo_accounts LEFT JOIN _tool_github_accounts instead, so every user the repo references gets a domain account, enriched with profile detail when we have it and login-only otherwise. The domain id uses the same generator the issue/PR convertors use, so the FKs line up. Also emit a repo_account for a PR's merged_by user so pull_requests.merged_by_id resolves too. The query stays MySQL/PostgreSQL-agnostic (COALESCE, no backtick quoting, parameterized via the dal) and mirrors the join already in account_org_collector.go. Adds the non-committer orphan case to the e2e fixture plus a referential- integrity assertion in TestAccountDataFlow. Verified on both MySQL and PostgreSQL. Co-Authored-By: Claude Opus 4.8 (1M context) --- backend/plugins/github/e2e/account_test.go | 33 ++++++++++ .../raw_tables/_tool_github_repo_accounts.csv | 1 + .../github/e2e/snapshot_tables/account.csv | 15 +++-- .../plugins/github/tasks/account_convertor.go | 64 +++++++++++++++---- backend/plugins/github/tasks/pr_extractor.go | 10 +++ 5 files changed, 102 insertions(+), 21 deletions(-) diff --git a/backend/plugins/github/e2e/account_test.go b/backend/plugins/github/e2e/account_test.go index 817ef6fab22..7b6466aba23 100644 --- a/backend/plugins/github/e2e/account_test.go +++ b/backend/plugins/github/e2e/account_test.go @@ -20,11 +20,15 @@ package e2e import ( "testing" + "github.com/apache/incubator-devlake/core/dal" "github.com/apache/incubator-devlake/core/models/domainlayer/crossdomain" + "github.com/apache/incubator-devlake/core/models/domainlayer/didgen" "github.com/apache/incubator-devlake/helpers/e2ehelper" "github.com/apache/incubator-devlake/plugins/github/impl" "github.com/apache/incubator-devlake/plugins/github/models" "github.com/apache/incubator-devlake/plugins/github/tasks" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestAccountDataFlow(t *testing.T) { @@ -108,4 +112,33 @@ func TestAccountDataFlow(t *testing.T) { "_raw_data_remark", }, ) + + // Referential-integrity invariant (#8886): every account the repo references in + // _tool_github_repo_accounts must have a domain `accounts` row, so issues.creator_id / + // pull_requests.author_id / merged_by_id never point at a missing account. We generate + // the domain id with the SAME generator the issue/PR convertors use, so this is a + // faithful proxy for the FK join the issue reported as broken. It also fails loudly if a + // future change shrinks ConvertAccounts' coverage or diverges the id generation. + accountIdGen := didgen.NewDomainIdGenerator(&models.GithubAccount{}) + var repoAccounts []models.GithubRepoAccount + require.NoError(t, dataflowTester.Dal.All(&repoAccounts, + dal.Where("repo_github_id = ? AND connection_id = ? AND account_id > 0", + taskData.Options.GithubId, taskData.Options.ConnectionId), + )) + require.NotEmpty(t, repoAccounts, "fixture must reference at least one account") + sawOrphanCase := false + for _, ra := range repoAccounts { + if ra.Login == "milichev" { + sawOrphanCase = true // the non-committer author from the issue repro + } + domainId := accountIdGen.Generate(taskData.Options.ConnectionId, ra.AccountId) + count, err := dataflowTester.Dal.Count( + dal.From(&crossdomain.Account{}), + dal.Where("id = ?", domainId), + ) + require.NoError(t, err) + assert.Equalf(t, int64(1), count, + "orphan FK: repo account %q (id=%d) has no domain accounts row %q", ra.Login, ra.AccountId, domainId) + } + assert.True(t, sawOrphanCase, "fixture should include the non-committer orphan case (milichev)") } diff --git a/backend/plugins/github/e2e/raw_tables/_tool_github_repo_accounts.csv b/backend/plugins/github/e2e/raw_tables/_tool_github_repo_accounts.csv index 8b86664428f..d92baa5e79c 100644 --- a/backend/plugins/github/e2e/raw_tables/_tool_github_repo_accounts.csv +++ b/backend/plugins/github/e2e/raw_tables/_tool_github_repo_accounts.csv @@ -6,6 +6,7 @@ connection_id,account_id,repo_github_id,login 1,3971390,134018330,ppmoon 1,7496278,134018330,panjf2000 1,8518239,134018330,gitter-badger +1,145564,134018330,milichev 1,11763614,2,Moonlight-Zhao 1,12420699,2,shanghai-Jerry 1,14950473,2,zqkgo diff --git a/backend/plugins/github/e2e/snapshot_tables/account.csv b/backend/plugins/github/e2e/snapshot_tables/account.csv index 1092e7a5296..040815aa80b 100644 --- a/backend/plugins/github/e2e/snapshot_tables/account.csv +++ b/backend/plugins/github/e2e/snapshot_tables/account.csv @@ -1,8 +1,9 @@ id,email,full_name,user_name,avatar_url,organization,_raw_data_params,_raw_data_table,_raw_data_id,_raw_data_remark -github:GithubAccount:1:1052632,runner.mei@,runner,runner-mei,https://avatars.githubusercontent.com/u/1052632?v=4,,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,13, -github:GithubAccount:1:21979,appleboy.tw@gmail.com,Bo-Yi Wu,appleboy,https://avatars.githubusercontent.com/u/21979?v=4,"COSCUP,nodejs-tw,moztw,h5bp,CodeIgniter-TW,drone,Getmore,golangtw,laravel-taiwan,go-xorm,gin-gonic,PHPConf-TW,Mediatek-Cloud,SJFinder,go-gitea,laradock,gin-contrib,tagfans,maintainers,go-training,go-ggz,the-benchmarker,golang-queue","{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,8, -github:GithubAccount:1:3794113,shanhu5739@gmail.com,Derek,shanhuhai5739,https://avatars.githubusercontent.com/u/3794113?v=4,,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,2, -github:GithubAccount:1:3971390,cnliuyunpeng@gmail.com,ppmoon,ppmoon,https://avatars.githubusercontent.com/u/3971390?v=4,,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,14, -github:GithubAccount:1:7496278,i@andypan.me,Andy Pan,panjf2000,https://avatars.githubusercontent.com/u/7496278?v=4,,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,5, -github:GithubAccount:1:8518239,badger@gitter.im,The Gitter Badger,gitter-badger,https://avatars.githubusercontent.com/u/8518239?v=4,,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,9, -github:GithubAccount:1:964542,sarath.sp06@gmail.com,Sarath Sadasivan Pillai,sarathsp06,https://avatars.githubusercontent.com/u/964542?v=4,"exotel,leadmrktr,shellagilehub,odysseyhack,boodltech","{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,1, +github:GithubAccount:1:1052632,runner.mei@,runner,runner-mei,https://avatars.githubusercontent.com/u/1052632?v=4,,,,0, +github:GithubAccount:1:145564,,,milichev,,,,,0, +github:GithubAccount:1:21979,appleboy.tw@gmail.com,Bo-Yi Wu,appleboy,https://avatars.githubusercontent.com/u/21979?v=4,"COSCUP,nodejs-tw,moztw,h5bp,CodeIgniter-TW,drone,Getmore,golangtw,laravel-taiwan,go-xorm,gin-gonic,PHPConf-TW,Mediatek-Cloud,SJFinder,go-gitea,laradock,gin-contrib,tagfans,maintainers,go-training,go-ggz,the-benchmarker,golang-queue",,,0, +github:GithubAccount:1:3794113,shanhu5739@gmail.com,Derek,shanhuhai5739,https://avatars.githubusercontent.com/u/3794113?v=4,,,,0, +github:GithubAccount:1:3971390,cnliuyunpeng@gmail.com,ppmoon,ppmoon,https://avatars.githubusercontent.com/u/3971390?v=4,,,,0, +github:GithubAccount:1:7496278,i@andypan.me,Andy Pan,panjf2000,https://avatars.githubusercontent.com/u/7496278?v=4,,,,0, +github:GithubAccount:1:8518239,badger@gitter.im,The Gitter Badger,gitter-badger,https://avatars.githubusercontent.com/u/8518239?v=4,,,,0, +github:GithubAccount:1:964542,sarath.sp06@gmail.com,Sarath Sadasivan Pillai,sarathsp06,https://avatars.githubusercontent.com/u/964542?v=4,"exotel,leadmrktr,shellagilehub,odysseyhack,boodltech",,,0, diff --git a/backend/plugins/github/tasks/account_convertor.go b/backend/plugins/github/tasks/account_convertor.go index e86780d2746..3401f9df1e2 100644 --- a/backend/plugins/github/tasks/account_convertor.go +++ b/backend/plugins/github/tasks/account_convertor.go @@ -22,6 +22,7 @@ import ( "github.com/apache/incubator-devlake/core/dal" "github.com/apache/incubator-devlake/core/errors" + "github.com/apache/incubator-devlake/core/models/common" "github.com/apache/incubator-devlake/core/models/domainlayer" "github.com/apache/incubator-devlake/core/models/domainlayer/crossdomain" "github.com/apache/incubator-devlake/core/models/domainlayer/didgen" @@ -30,6 +31,19 @@ import ( "github.com/apache/incubator-devlake/plugins/github/models" ) +// repoAccountForConvert is the row projected by ConvertAccounts' query: every +// account referenced by the repo (from _tool_github_repo_accounts), enriched +// with profile detail from _tool_github_accounts when it was collected. The +// embedded NoPKModel carries the RawDataOrigin across to the domain row. +type repoAccountForConvert struct { + Id int + Login string + Name string + Email string + AvatarUrl string + common.NoPKModel +} + func init() { RegisterSubtaskMeta(&ConvertAccountsMeta) } @@ -38,12 +52,12 @@ var ConvertAccountsMeta = plugin.SubTaskMeta{ Name: "Convert Users", EntryPoint: ConvertAccounts, EnabledByDefault: true, - Description: "Convert tool layer table github_accounts into domain layer table accounts", + Description: "Convert every account referenced by the repo (tool layer repo_accounts, enriched by github_accounts) into domain layer table accounts", DomainTypes: []string{plugin.DOMAIN_TYPE_CROSS}, DependencyTables: []string{ - models.GithubAccount{}.TableName(), // cursor - models.GithubRepoAccount{}.TableName(), // cursor - models.GithubAccountOrg{}.TableName()}, // account id gen + models.GithubRepoAccount{}.TableName(), // cursor (every user referenced by the repo) + models.GithubAccount{}.TableName(), // left-join enrichment (profile detail, optional) + models.GithubAccountOrg{}.TableName()}, // org pluck ProductTables: []string{crossdomain.Account{}.TableName()}, } @@ -53,7 +67,7 @@ func ConvertAccounts(taskCtx plugin.SubTaskContext) errors.Error { accountIdGen := didgen.NewDomainIdGenerator(&models.GithubAccount{}) - converter, err := api.NewStatefulDataConverter(&api.StatefulDataConverterArgs[models.GithubAccount]{ + converter, err := api.NewStatefulDataConverter(&api.StatefulDataConverterArgs[repoAccountForConvert]{ SubtaskCommonArgs: &api.SubtaskCommonArgs{ SubTaskContext: taskCtx, Table: RAW_ACCOUNT_TABLE, @@ -62,29 +76,51 @@ func ConvertAccounts(taskCtx plugin.SubTaskContext) errors.Error { Name: data.Options.Name, }, }, + // Source every account referenced by this repo from _tool_github_repo_accounts + // (which the issue/PR/commit extractors populate for any author, assignee, or + // merged-by user), and LEFT JOIN _tool_github_accounts for profile detail when it + // was collected. This guarantees a domain `accounts` row for every CreatorId / + // AuthorId the other convertors emit, instead of only for users who committed. + // SQL is kept DB-agnostic (no backtick quoting, COALESCE not IFNULL) so it runs on + // both MySQL and PostgreSQL. Input: func(stateManager *api.SubtaskStateManager) (dal.Rows, errors.Error) { clauses := []dal.Clause{ - dal.Select("_tool_github_accounts.*"), - dal.From(&models.GithubAccount{}), + dal.Select(`_tool_github_repo_accounts.account_id AS id, + _tool_github_repo_accounts.login AS login, + COALESCE(ga.name, '') AS name, + COALESCE(ga.email, '') AS email, + COALESCE(ga.avatar_url, '') AS avatar_url, + _tool_github_repo_accounts._raw_data_params AS _raw_data_params, + _tool_github_repo_accounts._raw_data_table AS _raw_data_table, + _tool_github_repo_accounts._raw_data_id AS _raw_data_id, + _tool_github_repo_accounts._raw_data_remark AS _raw_data_remark`), + dal.From(&models.GithubRepoAccount{}), + dal.Join(`left join _tool_github_accounts ga on ( + ga.connection_id = _tool_github_repo_accounts.connection_id + AND ga.id = _tool_github_repo_accounts.account_id + )`), dal.Where( - "repo_github_id = ? and _tool_github_accounts.connection_id=?", + `_tool_github_repo_accounts.repo_github_id = ? + AND _tool_github_repo_accounts.connection_id = ? + AND _tool_github_repo_accounts.account_id > 0`, data.Options.GithubId, data.Options.ConnectionId, ), - dal.Join(`left join _tool_github_repo_accounts gra on ( - _tool_github_accounts.connection_id = gra.connection_id - AND _tool_github_accounts.id = gra.account_id - )`), } if stateManager.IsIncremental() { since := stateManager.GetSince() if since != nil { - clauses = append(clauses, dal.Where("_tool_github_accounts.updated_at >= ?", since)) + // Incremental cursor intentionally tracks _tool_github_repo_accounts.updated_at + // (repo membership), not _tool_github_accounts.updated_at (profile freshness): + // account-detail re-enrichment is reconciled on the next full sync. Do not switch + // this back to _tool_github_accounts — that is what left issue/PR-only authors + // orphaned (#8886). + clauses = append(clauses, dal.Where("_tool_github_repo_accounts.updated_at >= ?", since)) } } return db.Cursor(clauses...) }, - Convert: func(githubUser *models.GithubAccount) ([]interface{}, errors.Error) { + Convert: func(githubUser *repoAccountForConvert) ([]interface{}, errors.Error) { // query related orgs var orgs []string err := db.Pluck(`org_login`, &orgs, diff --git a/backend/plugins/github/tasks/pr_extractor.go b/backend/plugins/github/tasks/pr_extractor.go index 802846f301c..9c3f1ca8fc9 100644 --- a/backend/plugins/github/tasks/pr_extractor.go +++ b/backend/plugins/github/tasks/pr_extractor.go @@ -151,6 +151,16 @@ func ExtractApiPullRequests(taskCtx plugin.SubTaskContext) errors.Error { githubPr.AuthorName = githubUser.Login githubPr.AuthorId = githubUser.AccountId } + // Emit a repo_account for the merged-by user too, so pull_requests.merged_by_id + // resolves to a domain account instead of an orphan FK (ConvertAccounts sources + // every referenced user from _tool_github_repo_accounts). + if body.MergedBy != nil { + mergedByUser, err := convertAccount(body.MergedBy, data.Options.GithubId, data.Options.ConnectionId) + if err != nil { + return nil, err + } + results = append(results, mergedByUser) + } for _, label := range body.Labels { results = append(results, &models.GithubPrLabel{ ConnectionId: data.Options.ConnectionId,