-
Notifications
You must be signed in to change notification settings - Fork 3
/
crawl.js
284 lines (251 loc) · 14.4 KB
/
crawl.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
const config = require('./config.json');
const MongoClient = require('mongodb').MongoClient;
const axios = require('axios');
// Default timeout applied to every axios request made by this script.
axios.defaults.timeout = 60*1000; // 60s = 60000 ms
// Database client; the actual connection is only opened by the `mongo.connect(...)` call below.
const mongo = new MongoClient(config.dbUrl, { useNewUrlParser: true, useUnifiedTopology: true } );
// Supplies the aggregation pipelines (GET_ALL_*_PIPELINE) used after crawling to rebuild the derived collections.
const dbqueries = require('./src/dbqueries.js');
// Moment this run started, as a JSON date string. Documents whose `retrieved` value is lower than this
// were not refreshed during this run (see the `unsuccessfulCrawls` update after crawling).
const starttimestamp = new Date().toJSON();
// Verbose mode (full error objects, per-collection progress) is enabled only when the first
// command line argument is exactly `--verbose`.
const verbose = process.argv[2] == '--verbose';
console.log('Connecting to database server...');
mongo.connect(async (error, client) => {
if(error != null) {
console.log();
console.log('An error occurred while connecting to the database server (' + error.name + ': ' + error.message + ')');
if(verbose) {
console.log(error);
}
console.log();
console.log('CRAWLING ABORTED!')
return 1; // abort with non-zero exit code
}
// Handles for the configured database and for the collection that stores the raw crawl results.
const db = client.db(config.dbName);
const collection = db.collection('raw');
console.log('Connected to database server.');
console.log('');

// Create all indexes up front. A failure is reported but does not stop the crawl.
console.log('Setting up database indexes...');
try {
    // One row per index: [collection name, key specification, index options].
    // They are created strictly one after another, in exactly this order.
    const indexDefinitions = [
        ['raw', {service: 1, api_version: 1, path: 1}, { name: 'service-apiversion-path_unique', unique: true }],
        ['backends', {backend: 1}, { name: 'backend_unique', unique: true }],
        ['backends', {service: 1, api_version: 1}, { name: 'service-apiversion_unique', unique: true }],
        ['collections', {service: 1, api_version: 1, id: 1}, { name: 'service-apiversion-id_unique', unique: true }],
        ['collections', {id: "text", title: "text", description: "text"}, { name: 'id-title-description_text' }],
        ['processes', {service: 1, api_version: 1, id: 1}, { name: 'service-apiversion-id_unique', unique: true }],
        ['processes', {id: "text", summary: "text", description: "text", "returns.description": "text"}, { name: 'id-summary-description_text' }]
    ];
    for (const [collectionName, keys, options] of indexDefinitions) {
        await db.collection(collectionName).createIndex(keys, options);
    }
    console.log('Set up database indexes.');
}
catch (indexError) {
    console.log(`An error occurred while setting up database indexes (${indexError.name}: ${indexError.message})`);
    if (verbose) {
        console.log(indexError);
    }
}
console.log('');

// Standard endpoints that are downloaded from a backend when its capabilities list them with a GET method.
const endpoints = [
    '/collections',
    '/processes',
    '/output_formats',
    '/file_formats',
    '/service_types',
    '/udf_runtimes'
];

// Filled while crawling and read again during post-processing:
// "<serviceUrl>@<api_version>" for every backend found in a well-known document ...
let allIndividualBackends = [];
// ... and the URL of every service whose well-known document could not be fetched or read.
let allFailedServices = [];

console.log(`Crawling all backends... (timeout per request is ${axios.defaults.timeout / 1000} seconds)`);
console.log('');
// Crawl every configured service:
//   1. read the service's well-known document to discover its individual backends (one per API version),
//   2. ask each backend for its capabilities to find out which documents can be downloaded,
//   3. download each document and store it in the `raw` collection.
// Errors are logged and crawling continues with the next document / backend / service.
for (const [name, configuredUrl] of Object.entries(config.backends)) {
    const serviceUrl = configuredUrl.replace(/\/$/, ''); // always without trailing slash
    const url = serviceUrl + '/.well-known/openeo';
    const individualBackends = {}; // maps api_version -> backend URL
    console.log(' - ' + name + ' (well-known document: ' + url + ')');

    // enforce HTTPS (check the complete scheme, a bare 'https' prefix would also accept e.g. 'httpsx://')
    if (!url.startsWith('https://')) {
        console.log("REFUSING to crawl insecure service " + serviceUrl + " that does not use HTTPS.\n");
        continue;
    }

    try {
        const wellKnown = await axios(url);
        wellKnown.data.versions
            .filter(b => !b.api_version.startsWith('0.3.')) // the Hub doesn't support openEO API v0.3.x anymore
            .forEach(b => { individualBackends[b.api_version] = b.url.replace(/\/$/, ''); }); // URL always without trailing slash
        allIndividualBackends = allIndividualBackends.concat(Object.keys(individualBackends).map(version => serviceUrl + '@' + version));
    }
    catch (error) {
        console.log('An error occurred while getting or reading ' + url + ' (' + error.name + ': ' + error.message + ')');
        if (verbose) {
            console.log(error);
        }
        // remembered so that post-processing does not purge the stored data of a service that merely failed this time
        allFailedServices.push(serviceUrl);
    }
    console.log('');

    for (const [api_version, backendUrl] of Object.entries(individualBackends)) {
        // enforce HTTPS
        if (!backendUrl.startsWith('https://')) {
            console.log("REFUSING to crawl insecure backend " + backendUrl + " that does not use HTTPS.\n");
            continue;
        }

        // Paths to download for this backend. Stays empty if the capabilities can't be read.
        let paths = [];
        try {
            console.log(' - ' + backendUrl + ' ...');
            const capabilities = await axios(backendUrl + '/');
            const caps = capabilities.data.endpoints
                .filter(e => e.methods.some(m => m.toLowerCase() === 'get')) // only keep those that have a GET method
                // Replace each parameter name with nothing to ease querying. The match is limited to a single
                // `{...}` group: a greedy `.*` would turn `/collections/{a}/items/{b}` into `/collections/{}`
                // and thereby pretend that the collection detail endpoint is supported.
                .map(e => e.path.replace(/\{[^}]*\}/g, '{}'));
            const hasEndpoint = (path) => caps.includes(path);

            // add all standard endpoints that are supported
            paths.push('/');
            paths = paths.concat(endpoints.filter(hasEndpoint));

            // if `/collections/{id}` is supported: add the individual collections too
            try {
                if (hasEndpoint('/collections') && hasEndpoint('/collections/{}')) {
                    const collections = (await axios(backendUrl + '/collections')).data.collections;
                    paths = paths.concat(collections.map(c => '/collections/' + c.id));
                }
            }
            catch (error) {
                console.log('An error occurred while gathering collection detail URLs for ' + backendUrl + ' (' + error.name + ': ' + error.message + ')');
                if (verbose) {
                    console.log(error);
                }
            }
        }
        catch (error) {
            console.log('An error occurred while gathering endpoint URLs for ' + backendUrl + ' (' + error.name + ': ' + error.message + ')');
            if (verbose) {
                console.log(error);
            }
        }

        // Title of THIS backend, taken from its `/` document (downloaded first, see `paths` above).
        // It is declared per backend on purpose: as an undeclared global it made every insert fail with a
        // ReferenceError until some backend provided a title, and afterwards it leaked the previous
        // backend's title into documents of backends that have none. `null` means "no title known".
        let backendTitle = null;
        let bulkNotice = false;

        for (const [index, path] of paths.entries()) {
            const isCollectionDetail = path.includes('/collections/');
            if (!isCollectionDetail || verbose) {
                console.log(' - Downloading ' + backendUrl + path + ' ...');
            }
            if (!bulkNotice && !verbose && isCollectionDetail) {
                // collection details were appended last, so everything from here on is a collection detail
                console.log(' - Downloading details for all ' + (paths.length - index) + ' collections... (only outputting errors)');
                bulkNotice = true;
            }

            try {
                const response = await axios(backendUrl + path);

                // extract backend title (if applicable)
                if (path === '/' && response.data.title) {
                    backendTitle = response.data.title;
                }

                // save to database
                const data = response.data;
                try {
                    // In the $set part, findOneAndUpdate doesn't allow field names that contain '.' or '$', see https://jira.mongodb.org/browse/SERVER-30575
                    // And that despite the MongoDB server allowing this since version 3.6, see https://docs.mongodb.com/v4.0/reference/limits/#Restrictions-on-Field-Names
                    /*
                    await collection.findOneAndUpdate(
                        { service: serviceUrl, api_version: api_version, path: path },
                        { $set: {
                            backend: backendUrl,
                            backendTitle: backendTitle,
                            group: name,
                            content: data,
                            retrieved: new Date().toJSON(),
                            unsuccessfulCrawls: 0
                        }},
                        { upsert: true }
                    )
                    */
                    // Therefore do the same behaviour (an upsert) manually:
                    // (as officially suggested by MongoDB staff at https://jira.mongodb.org/browse/SERVER-30575?focusedCommentId=1821530#comment-1821530)
                    // (we can't use updateOne because that does the same annoying check as findOneAndUpdate, only insertOne bypasses it, therefore we do a delete+insert instead of an update)
                    const documentKey = { service: serviceUrl, api_version: api_version, path: path };
                    const foundItem = await collection.findOne(documentKey);
                    if (foundItem) {
                        await collection.deleteOne(documentKey);
                    }
                    await collection.insertOne({
                        service: serviceUrl,
                        api_version: api_version,
                        path: path,
                        backend: backendUrl,
                        backendTitle: backendTitle,
                        group: name,
                        content: data,
                        retrieved: new Date().toJSON(),
                        unsuccessfulCrawls: 0
                    });
                }
                catch (error) {
                    console.log('An error occurred while writing ' + backendUrl + path + ' to the database (' + error.name + ': ' + error.message + ')');
                    if (verbose) {
                        console.log(error);
                    }
                }
            }
            catch (error) {
                console.log('An error occurred while downloading ' + backendUrl + path + ' (' + error.name + ': ' + error.message + ')');
                if (verbose) {
                    console.log(error);
                }
            }
        }
        console.log('');
    }
    console.log('');
}
// Once all requests have finished: purge outdated documents from the `raw` collection, rebuild the
// derived `backends`, `collections` and `processes` collections, and finally close the connection.
try {
    console.log('');
    console.log('Finished crawling of all backends.');
    console.log('');
    console.log('Processing data...');

    // Delete all entries that belong to a group that was meanwhile deleted (or renamed)
    await collection.deleteMany({ group: { $not: { $in: Object.keys(config.backends) } } });

    // Delete all entries that don't belong to one of the backends that are listed in the currently configured services's well-known documents
    // But exempt those that failed to download. The two conditions are implicitly connected with AND.
    // See also issue #79, https://stackoverflow.com/q/63937811, and the MongoDB docs for "$expr" and "$in (aggregation)"
    // Note that the two "$in" are NOT exactly the same operator (one is from the query language, one from the aggregation framework)
    await collection.deleteMany({
        $expr: { $not: { $in: [ {$concat:["$service","@","$api_version"]}, allIndividualBackends ] } },
        service: { $not: { $in: allFailedServices } }
    });

    // Increase `unsuccessfulCrawls` counter of items that were not updated in this run
    await collection.updateMany({retrieved: {$lt: starttimestamp}}, {$inc: {unsuccessfulCrawls: 1}});

    // Delete `/collection/{id}` documents that are no longer referenced from their main `/collections` document
    // `unsuccessfulCrawls` of legit candidates *should* always be ==1 (not ==0 because then they would still be in the main collection document, not >1 because then they would already have been removed during the previous crawl, but use >=1 anyway)
    const candidates = await collection.find({unsuccessfulCrawls: {$gte: 1}, path: {$regex: /^\/collections\/.+$/}}).toArray();
    // get "ground truth" for *all* backends
    const mainDocuments = await collection.find({path: "/collections"}).toArray();

    // Index the ground truth by backend so each candidate needs a single lookup.
    const invalidBackends = new Set();      // backends whose main `/collections` document seems invalid
    const listedCollections = new Map();    // backend -> `collections` array of its (first) valid main document
    for (const doc of mainDocuments) {
        // `!= null` guards are needed because `typeof null` is 'object' as well
        const list = (doc != null && doc.content != null) ? doc.content.collections : undefined;
        if (!Array.isArray(list)) {
            invalidBackends.add(doc != null ? doc.backend : undefined);
        }
        else if (!listedCollections.has(doc.backend)) {
            listedCollections.set(doc.backend, list);
        }
    }

    const todelete = candidates.filter(candidate => {
        // don't delete if main `/collections` document seems invalid
        if (invalidBackends.has(candidate.backend)) {
            return false;
        }
        // Without any main document there is no ground truth to check against: keep the candidate
        // (the `deleteAfter` threshold below still removes it eventually). Previously this case threw a
        // TypeError, which skipped ALL remaining processing steps of this run.
        const list = listedCollections.get(candidate.backend);
        if (list === undefined) {
            return false;
        }
        const candidateId = candidate.content != null ? candidate.content.id : undefined;
        // keep candidate for deletion if it's not found in its backend's main `/collections` document
        // (loose equality kept on purpose so that ids still match if one side is a number and the other a string)
        return !list.some(entry => entry != null && entry.id == candidateId);
    });
    await collection.deleteMany({_id: {$in: todelete.map(e => e._id)}}); // actually delete remaining candidates

    // Similar (not identical!) query (relies solely on `unsuccessfulCrawls` and DOES NOT check the actual ground truth)
    // collection.deleteMany({unsuccessfulCrawls: {$gte: 1}, path: {$regex: /^\/collections\/.+$/}});

    // Delete documents that have reached the configured threshold of maximum unsuccessful crawls
    await collection.deleteMany({unsuccessfulCrawls: {$gte: config.unsuccessfulCrawls.deleteAfter}});

    // Get all collections as usual, but in the end remove `id` from result to avoid "duplicate key" errors and output.
    // Call `hasNext` because as long as there's no I/O request the Mongo Node driver doesn't actually execute the pipeline.
    await collection.aggregate(dbqueries.GET_ALL_BACKENDS_PIPELINE.concat([{$project: {_id: 0}}, {$out: 'backends'}])).hasNext();
    await collection.aggregate(dbqueries.GET_ALL_COLLECTIONS_PIPELINE.concat([{$project: {_id: 0}}, {$out: 'collections'}])).hasNext();
    await collection.aggregate(dbqueries.GET_ALL_PROCESSES_PIPELINE.concat([{$project: {_id: 0}}, {$out: 'processes'}])).hasNext();

    console.log('Finished processing data.');
    console.log('');
}
catch (error) {
    console.log('An error occurred while finalising the crawl process (' + error.name + ': ' + error.message + ')');
    if (verbose) {
        console.log(error);
    }
    console.log('');
}
finally {
    // Always release the connection, no matter whether processing succeeded.
    console.log('Closing database connection...');
    try {
        await mongo.close();
        console.log('Closed database connection.');
    }
    catch (error) {
        console.log('An error occurred while closing the database connection (' + error.name + ': ' + error.message + ')');
        if (verbose) {
            console.log(error);
        }
    }
    console.log('');
    console.log('DONE!');
}
});