From b2835347ba65165d740937faa999ccdcce2511c5 Mon Sep 17 00:00:00 2001
From: Samuel Macbeth
Date: Wed, 28 Dec 2022 10:49:27 -0500
Subject: [PATCH] Add element collector

---
 collectors/ElementCollector.js | 173 ++++++++++++++++++++++++++++++++++
 main.js                        |   2 +
 2 files changed, 175 insertions(+)
 create mode 100644 collectors/ElementCollector.js

diff --git a/collectors/ElementCollector.js b/collectors/ElementCollector.js
new file mode 100644
index 00000000..fbc8efca
--- /dev/null
+++ b/collectors/ElementCollector.js
@@ -0,0 +1,173 @@
+const fs = require('fs').promises;
+const BaseCollector = require('./BaseCollector');
+
+/**
+ * @typedef {{present: string[], visible: string[]}} ElementCheckResult
+ */
+
+/**
+ * Reports which selectors loaded from `fanboy-cookiemonster.txt` match elements on the crawled
+ * page (`present`) and which of those match at least one visible element (`visible`).
+ */
+class ElementCollector extends BaseCollector {
+
+    id() {
+        return 'elements';
+    }
+
+    /**
+     * @param {import('./BaseCollector').CollectorInitOptions} options
+     */
+    init(options) {
+        this.context = options.context;
+        this.log = options.log;
+        /**
+         * @type {import('puppeteer').Frame[]}
+         */
+        this.frames = [];
+        /**
+         * @type {import('puppeteer').Page | undefined}
+         */
+        this.page = undefined;
+        /**
+         * @type {Promise<ElementCheckResult | undefined> | undefined}
+         */
+        this.check = undefined;
+        this.cookieSelectors = this._loadCookieMonsterRules();
+        // Observe the promise right away so an unreadable rule file is logged instead of becoming
+        // an unhandled rejection. Code that awaits cookieSelectors later still sees the error.
+        this.cookieSelectors.catch(e => this.log(`Unable to load cookie rules: ${e.message}`));
+    }
+
+    /**
+     * Reads `./fanboy-cookiemonster.txt` (relative to the process working directory) and returns
+     * the selector part of every line that starts with `##`.
+     *
+     * @returns {Promise<string[]>}
+     */
+    async _loadCookieMonsterRules() {
+        const contents = await fs.readFile('./fanboy-cookiemonster.txt', {encoding: 'utf-8'});
+        return contents
+            .split(/\r?\n/) // accept CRLF files too, otherwise selectors keep a trailing '\r'
+            .filter(line => line.startsWith('##'))
+            .map(line => line.slice(2).trim())
+            .filter(selector => selector.length > 0);
+    }
+
+    /**
+     * @param {{cdpClient: import('puppeteer').CDPSession, url: string, type: import('./TargetCollector').TargetType}} targetInfo
+     */
+    addTarget(targetInfo) {
+        // Only the first page target starts a check. Replacing a pending check on every later
+        // page target would leave getData() waiting for a `load` event that has already fired.
+        if (targetInfo.type !== 'page' || this.check) {
+            return;
+        }
+        // Assigned synchronously so getData() always sees the pending check.
+        this.check = this._checkFirstPage();
+        // Log failures here so they are never unhandled; getData() still rejects with the error.
+        this.check.catch(e => this.log(`Element check failed: ${e.message}`));
+    }
+
+    /**
+     * Waits for the first page of the browser context to fire `load`, then checks the selectors.
+     * Note that the returned promise stays pending if that page never fires `load`.
+     *
+     * @returns {Promise<ElementCheckResult | undefined>} undefined when no page could be obtained
+     */
+    async _checkFirstPage() {
+        let pages;
+        try {
+            pages = await this.context.pages();
+        } catch (e) {
+            this.log('Unable to get pages');
+            return undefined;
+        }
+        const page = pages[0];
+        if (!page) {
+            this.log('Unable to get pages');
+            return undefined;
+        }
+        this.page = page;
+        await new Promise(resolve => page.once('load', resolve));
+        const selectors = await this.cookieSelectors;
+        return this._checkSelectors(page, selectors);
+    }
+
+    /**
+     * Runs all selectors against the page document in a single evaluate() call.
+     *
+     * @param {import('puppeteer').Page} page
+     * @param {string[]} selectors
+     * @returns {Promise<ElementCheckResult>}
+     */
+    _checkSelectors(page, selectors) {
+        return page.evaluate(testSelectors => {
+            /**
+             * @param {HTMLElement} elem
+             */
+            function isElementVisible(elem) {
+                if (!elem) {
+                    return false;
+                }
+                if (elem.offsetParent !== null) {
+                    return true;
+                }
+                // eslint-disable-next-line no-undef
+                const css = window.getComputedStyle(elem);
+                // fixed elements may be visible even if the parent is not
+                return css.position === 'fixed' && css.display !== 'none';
+            }
+
+            /** @type {string[]} */
+            const present = [];
+            /** @type {string[]} */
+            const visible = [];
+            testSelectors.forEach((/** @type {string} */ selector) => {
+                let elements;
+                try {
+                    // eslint-disable-next-line no-undef
+                    elements = document.querySelectorAll(selector);
+                } catch (e) {
+                    // not a selector the browser can parse - skip it instead of failing the check
+                    return;
+                }
+                if (elements.length === 0) {
+                    return;
+                }
+                present.push(selector);
+                let anyVisible = false;
+                elements.forEach(element => {
+                    // @ts-ignore
+                    if (!isElementVisible(element)) {
+                        return;
+                    }
+                    anyVisible = true;
+                    // @ts-ignore
+                    const style = element.style;
+                    if (style) {
+                        // set one property only, so other inline styles of the element survive
+                        style.setProperty('border', '4px dashed red');
+                    }
+                });
+                if (anyVisible) {
+                    visible.push(selector);
+                }
+            });
+            return {present, visible};
+        }, selectors);
+    }
+
+    /**
+     * Called after the crawl to retrieve the data. Can be async, can throw errors.
+     *
+     * @param {{finalUrl: string, urlFilter?: function(string):boolean}} options
+     * @returns {Promise<ElementCheckResult | undefined> | undefined}
+     */
+    // eslint-disable-next-line no-unused-vars
+    getData(options) {
+        return this.check;
+    }
+}
+
+module.exports = ElementCollector;
\ No newline at end of file
diff --git a/main.js b/main.js
index 1095c0ab..43256d12 100644
--- a/main.js
+++ b/main.js
@@ -8,6 +8,7 @@ const TargetCollector = require('./collectors/TargetCollector');
 const TraceCollector = require('./collectors/TraceCollector');
 const ScreenshotCollector = require('./collectors/ScreenshotCollector');
 const CMPCollector = require('./collectors/CMPCollector');
+const ElementCollector = require('./collectors/ElementCollector');
 
 // reexport main pieces of code so that they can be easily imported when this project is used as a dependency
 // e.g. `const {crawlerConductor} = require('3p-crawler');`
@@ -22,4 +23,5 @@ module.exports = {
     TraceCollector,
     ScreenshotCollector,
     CMPCollector,
+    ElementCollector,
 };
\ No newline at end of file