-
Notifications
You must be signed in to change notification settings - Fork 2
/
index.mjs
330 lines (315 loc) · 10.2 KB
/
index.mjs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
/**
* Copyright 2024, Arhan Chaudhary, All rights reserved.
*
* This program is *solely* intended for educational purposes. I love making
* software public, but I kindly request for you to be mindful and avoid misuse
* relating to email harvesting/spamming.
*
* Please familiarize yourself with GitHub's Acceptable Use Policies on:
*
* Impersonation https://docs.github.com/en/site-policy/acceptable-use-policies/github-impersonation
* Spam and Inauthentic Activity https://docs.github.com/en/site-policy/acceptable-use-policies/github-acceptable-use-policies#4-spam-and-inauthentic-activity-on-github
* Information Usage Restrictions https://docs.github.com/en/site-policy/acceptable-use-policies/github-acceptable-use-policies#7-information-usage-restrictions
* API Terms https://docs.github.com/en/site-policy/github-terms/github-terms-of-service#h-api-terms
* Excessive Bandwidth Use https://docs.github.com/en/site-policy/acceptable-use-policies/github-acceptable-use-policies#9-excessive-bandwidth-use
*
* And make sure your use of information complies with the GitHub Privacy Statement:
*
* https://docs.github.com/en/site-policy/privacy-policies/github-general-privacy-statement
*
* Thank you!
*/
// Prefix of the command-line flag that carries the requested co-author count.
const CO_AUTHOR_COUNT_FLAG = "--co-author-count=";
// The raw `--co-author-count=N` argument, or undefined when it was not passed.
const coAuthorCountArg = process.argv.find((arg) =>
  arg.startsWith(CO_AUTHOR_COUNT_FLAG)
);
// Note that not every co-author may be valid
// The explicit radix keeps the parse decimal; a missing flag parses to NaN and
// is rejected further down.
const CO_AUTHOR_COUNT = Number.parseInt(
  coAuthorCountArg?.slice(CO_AUTHOR_COUNT_FLAG.length),
  10
);
// how many followers to start searching from in descending order, set to
// Infinity to start from most followed users
const INITIAL_MAX_FOLLOWERS = Infinity;
// how many users to process in a single graphql query, 85 is around optimal
const BATCH_USER_COUNT = 85;
// how many concurrent email queries to make, any more than this gets secondary
// rate limited a lot and has diminishing returns
const CONCURRENCY_COUNT = 3;
// around how many co authors to get for each search user, set to Infinity to
// search every follower
const SEARCH_USER_FOLLOWERS_DEPTH = 500;
if (Number.isNaN(CO_AUTHOR_COUNT)) {
  // Echo the flag that was actually examined rather than whatever happens to
  // sit at argv[2], which may be unrelated to it.
  console.error(
    `Invalid co_author_count argument: ${coAuthorCountArg ?? "(not provided)"}\n` +
      "Usage: index.mjs --co-author-count=[N]"
  );
  process.exit(1);
}
import { stripIgnoredCharacters } from "graphql/utilities/stripIgnoredCharacters.js";
import { Octokit } from "octokit";
/**
 * Builds a throttle handler that logs which limit was hit and allows a single
 * retry per request.
 *
 * The returned handler logs `logPrefix` followed by the request method and
 * URL. When the request has not been retried yet (`retryCount === 0`) it also
 * logs when the retry is expected and returns `true`; otherwise it returns
 * `undefined`.
 */
const retryOnceAfterLimit = (logPrefix) => (retryAfter, options) => {
  console.warn(`${logPrefix} ${options.method} ${options.url}`);
  if (options.request.retryCount !== 0) {
    return undefined;
  }
  const retryAt = new Date();
  retryAt.setSeconds(retryAt.getSeconds() + retryAfter);
  console.warn(
    `[WARNING] Retrying after ${retryAfter} seconds: ${retryAt.toISOString()}`
  );
  return true;
};

// Shared API client, authenticated with the token in the GH_PAT environment
// variable. Both rate-limit callbacks behave the same apart from their log
// line.
const octokit = new Octokit({
  auth: process.env.GH_PAT,
  throttle: {
    onRateLimit: retryOnceAfterLimit(
      "[WARNING] Request quota exhausted for request"
    ),
    onSecondaryRateLimit: retryOnceAfterLimit(
      "[WARNING] SecondaryRateLimit detected for request"
    ),
  },
});
/**
 * Removes from `array`, in place, every element for which `predicate` returns
 * a falsy value. Surviving elements keep their relative order.
 *
 * Works in a single pass: kept elements are compacted towards the front and
 * the array is truncated once at the end, instead of calling `splice` (which
 * shifts the whole tail) for every removed element.
 *
 * @param {Array} array - The array to filter; it is mutated.
 * @param {(element: *) => boolean} predicate - Called once per element, from
 *   first to last; return truthy to keep the element.
 * @returns {void}
 */
function filterInPlace(array, predicate) {
  let keptCount = 0;
  for (let readIndex = 0; readIndex < array.length; readIndex++) {
    const element = array[readIndex];
    if (predicate(element)) {
      array[keptCount] = element;
      keptCount += 1;
    }
  }
  // Drop everything past the last kept element.
  array.length = keptCount;
}
/**
 * Builds the GraphQL query that looks up a commit author email for one slice
 * of `users`.
 *
 * The slice covers `BATCH_USER_COUNT` users starting at
 * `batchIndex * BATCH_USER_COUNT`. Each user gets its own aliased selection
 * named `_<index>`, where `<index>` is the user's position in the full `users`
 * array. The selection asks for the user's top non-fork repository by
 * stargazers and the latest commit on its default branch authored by that
 * user's id.
 *
 * @param {{login: string, id: string}[]} users - All users in the batch.
 * @param {number} batchIndex - Zero-based index of the slice to query.
 * @returns {string} The query after `stripIgnoredCharacters`.
 */
function emailsFromUsersQuery(users, batchIndex) {
  const sliceStart = batchIndex * BATCH_USER_COUNT;
  const sliceUsers = users.slice(sliceStart, sliceStart + BATCH_USER_COUNT);

  const userSelections = [];
  for (const [offset, { login, id }] of sliceUsers.entries()) {
    userSelections.push(`
      _${sliceStart + offset}: user(login: "${login}") {
        repositories(first: 1, isFork: false, orderBy: {field: STARGAZERS, direction: DESC}) {
          nodes {
            defaultBranchRef {
              target {
                ... on Commit {
                  history(first: 1, author: { id: "${id}" }) {
                    nodes {
                      author {
                        email
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    `);
  }

  const selectionSet = userSelections.join("\n");
  return stripIgnoredCharacters(`
    {
      ${selectionSet}
    }
  `);
}
/**
 * Looks up commit emails for the users at the front of `usersBatch` and yields
 * a `Co-authored-by:` trailer for each user that qualifies.
 *
 * Up to `CONCURRENCY_COUNT` queries are issued at once, each covering one
 * slice of `BATCH_USER_COUNT` users (see `emailsFromUsersQuery`). A user
 * qualifies when the returned email ends with `@users.noreply.github.com` and
 * the user's id is not already in `seenUsers`.
 *
 * Every user that was covered by a query is replaced by `null` in
 * `usersBatch`, whether it qualified, did not qualify, or its query failed.
 * `null` marks the slot as processed so the caller can drop it.
 *
 * @param {Array<{login: string, id: string}|null>} usersBatch - Users to look
 *   up; mutated as described above.
 * @param {Set<string>} seenUsers - Ids of users already yielded; qualifying
 *   ids are added to it.
 * @yields {string} A `Co-authored-by: <login> <<email>>` line.
 */
async function* coAuthorsFromUsersIterator(usersBatch, seenUsers) {
  const pendingResponses = [];
  for (let i = 0; i < CONCURRENCY_COUNT; i++) {
    const sliceStart = i * BATCH_USER_COUNT;
    if (sliceStart >= usersBatch.length) {
      // No users left for this or any later slice; an empty selection set
      // would only produce a failing request.
      break;
    }
    const query = emailsFromUsersQuery(usersBatch, i);
    const pendingResponse = octokit
      .graphql(query)
      .then((jsonWithEmails) => {
        if (!jsonWithEmails) {
          // Reject with a real Error so the handler below always has
          // something it can print.
          throw new Error("empty GraphQL response");
        }
        return jsonWithEmails;
      })
      .catch((e) => {
        console.error(
          `[ERROR] Error deriving emails for query ${query}: ${String(e)}`
        );
        // Give up on this slice: mark its users as processed.
        usersBatch.fill(null, sliceStart, sliceStart + BATCH_USER_COUNT);
      });
    pendingResponses.push(pendingResponse);
  }

  for (const jsonWithEmails of await Promise.all(pendingResponses)) {
    if (!jsonWithEmails) {
      // was caught
      continue;
    }
    for (const [label, jsonWithEmail] of Object.entries(jsonWithEmails)) {
      // the query label was prefixed by "_"
      const userIndex = Number(label.slice(1));
      const user = usersBatch[userIndex];
      // null indicates user was processed and should be removed from the batch
      usersBatch[userIndex] = null;

      // Any link of this chain may be missing (no repositories, empty or
      // corrupted repository, commit without an author), so none is assumed.
      const email =
        jsonWithEmail?.repositories?.nodes?.[0]?.defaultBranchRef?.target
          ?.history?.nodes?.[0]?.author?.email;
      const isNoReplyEmail =
        typeof email === "string" &&
        email.endsWith("@users.noreply.github.com");
      if (!user || !isNoReplyEmail || seenUsers.has(user.id)) {
        continue;
      }
      seenUsers.add(user.id);
      yield `Co-authored-by: ${user.login} <${email}>`;
    }
  }
}
/**
 * Yields co-author lines derived from the followers of `rootUser`.
 *
 * Followers are paged in 100 at a time and appended to `usersBatch` until it
 * holds at least `BATCH_USER_COUNT * CONCURRENCY_COUNT` users. The batch is
 * then handed to `coAuthorsFromUsersIterator`, and processed (`null`) entries
 * are dropped afterwards. This repeats until about
 * `SEARCH_USER_FOLLOWERS_DEPTH` co-authors have been yielded.
 *
 * The generator stops early, after logging, when the followers run out before
 * a full batch is available or when fetching a page fails. Users left in
 * `usersBatch` at that point stay there for the next call.
 *
 * @param {{login: string}} rootUser - The user whose followers are scanned.
 * @param {Array<{login: string, id: string}|null>} usersBatch - Shared work
 *   queue; may already hold users carried over from a previous root user.
 * @param {Set<string>} seenUsers - Ids already yielded; passed through to
 *   `coAuthorsFromUsersIterator`.
 * @yields {string} Whatever `coAuthorsFromUsersIterator` yields.
 */
async function* followerCoAuthorsIterator(rootUser, usersBatch, seenUsers) {
  const followerPages = octokit.graphql.paginate.iterator(
    stripIgnoredCharacters(`
      query($cursor: String) {
        user(login: "${rootUser.login}") {
          followers(first: 100, after: $cursor) {
            nodes {
              login
              id
            }
            pageInfo {
              hasNextPage
              endCursor
            }
          }
        }
      }
    `)
  );
  const fullBatchSize = BATCH_USER_COUNT * CONCURRENCY_COUNT;
  const dropProcessedUsers = () =>
    filterInPlace(usersBatch, (user) => user !== null);

  // Probably redundant, but the bookkeeping below relies on the batch holding
  // no processed entries, so clean it defensively.
  dropProcessedUsers();
  // Users carried over from the previous root user are counted against this
  // user's depth by starting below zero.
  let yieldedCount = -usersBatch.length;
  while (yieldedCount < SEARCH_USER_FOLLOWERS_DEPTH) {
    // When the batch is already full, the previous round did not drain it;
    // process it again without fetching more followers.
    const needsRefill = usersBatch.length < fullBatchSize;
    if (needsRefill) {
      try {
        // Leaving this loop with `break` and entering it again on a later
        // round continues with the next page of the same iterator.
        for await (const followersPage of followerPages) {
          usersBatch.push(...followersPage.user.followers.nodes);
          if (usersBatch.length >= fullBatchSize) {
            break;
          }
        }
        const ranOutOfFollowers = usersBatch.length < fullBatchSize;
        if (ranOutOfFollowers) {
          console.warn(
            `[WARNING] Only processed ${usersBatch.length}/${SEARCH_USER_FOLLOWERS_DEPTH} followers from user ${rootUser.login}`
          );
          return;
        }
      } catch (e) {
        console.error(
          `[ERROR] Error fetching followers for ${rootUser.login}: ${e.toString()}`
        );
        return;
      }
    }

    const coAuthors = coAuthorsFromUsersIterator(usersBatch, seenUsers);
    for await (const coAuthor of coAuthors) {
      yield coAuthor;
      yieldedCount += 1;
    }
    dropProcessedUsers();
  }
}
/**
 * Yields users from a GitHub user search, 100 per page.
 *
 * The search is restricted to users with fewer than `searchMaxFollowers`
 * followers, or left unrestricted (`followers:>=0`) when `searchMaxFollowers`
 * is `Infinity`.
 *
 * Result nodes that are `null` or empty objects are skipped. A failure while
 * fetching a page is logged and ends the iteration without throwing.
 *
 * @param {number} searchMaxFollowers - Exclusive upper bound on follower
 *   count, or `Infinity` for no bound.
 * @yields {{login: string, id: string}} One search result user at a time.
 */
async function* searchUsersIterator(searchMaxFollowers) {
  const followersFilter =
    searchMaxFollowers === Infinity
      ? "followers:>=0"
      : `followers:<${searchMaxFollowers}`;
  const searchPages = octokit.graphql.paginate.iterator(
    stripIgnoredCharacters(`
      query($cursor: String) {
        search(query: "${followersFilter}", type: USER, first: 100, after: $cursor) {
          nodes {
            ... on User {
              login
              id
            }
          }
          pageInfo {
            hasNextPage
            endCursor
          }
        }
      }
    `)
  );
  try {
    for await (const searchPage of searchPages) {
      for (const searchUser of searchPage.search.nodes) {
        // can timeout and return an empty object; a null node is skipped as
        // well so that one bad entry does not abort the whole search
        if (searchUser != null && Object.keys(searchUser).length !== 0) {
          yield searchUser;
        }
      }
    }
  } catch (e) {
    console.error(`[ERROR] Error fetching search users: ${String(e)}`);
  }
}
/**
 * Yields co-author lines by walking user search results and, for each found
 * user, that user's followers.
 *
 * Each pass runs a search bounded by `maxFollowers` (initially
 * `INITIAL_MAX_FOLLOWERS`). When the pass is exhausted, the follower count of
 * the last user it returned becomes the bound for the next pass.
 *
 * The generator ends when a pass returns no users at all. Without that, the
 * loop would re-issue the same queries forever without making progress.
 * If the follower-count lookup between passes fails, the error propagates to
 * the consumer.
 *
 * @yields {string} Whatever `followerCoAuthorsIterator` yields.
 */
async function* coAuthorsIterator() {
  // I know... but this needs to be sequential or else github complains
  // about secondary rate limits
  const usersBatch = [];
  const seenUsers = new Set();
  let maxFollowers = INITIAL_MAX_FOLLOWERS;
  while (true) {
    // Login of the last user returned by this pass, if any.
    let minFollowersLogin;
    for await (const searchUser of searchUsersIterator(maxFollowers)) {
      console.warn(
        `[INFO] Processing followers for ${searchUser.login} at ${Math.round(
          (new Date() - start) / 1000
        )} seconds in`
      );
      minFollowersLogin = searchUser.login;
      yield* followerCoAuthorsIterator(searchUser, usersBatch, seenUsers);
    }
    if (!minFollowersLogin) {
      // The search produced nothing (no more matching users, or it failed):
      // another pass with the same bound could not do better.
      return;
    }
    // if this fails, tough luck
    ({
      user: {
        followers: { totalCount: maxFollowers },
      },
    } = await octokit.graphql(
      stripIgnoredCharacters(`
        {
          user(login: "${minFollowersLogin}") {
            followers {
              totalCount
            }
          }
        }
      `)
    ));
  }
}
// Script entry point: print co-author lines to stdout until the requested
// number has been reached or the iterator runs dry. All diagnostics go to
// stderr so stdout holds only the co-author lines.

// Start of the run; coAuthorsIterator also reads this for its progress logs.
const start = new Date();
let coAuthorCount = 0;
for await (const coAuthor of coAuthorsIterator()) {
  console.log(coAuthor);
  coAuthorCount += 1;
  const reachedTarget = coAuthorCount >= CO_AUTHOR_COUNT;
  if (reachedTarget) {
    break;
  }
}
if (coAuthorCount < CO_AUTHOR_COUNT) {
  // The iterator ended before the requested count was reached.
  console.warn(
    `[WARNING] Only processed ${coAuthorCount}/${CO_AUTHOR_COUNT} co-authors`
  );
}
const elapsedSeconds = Math.round((new Date() - start) / 1000);
console.warn(`\nDone in ${elapsedSeconds} seconds!`);