diff --git a/package.json b/package.json index 9e5316afa3..3fe952765d 100644 --- a/package.json +++ b/package.json @@ -174,8 +174,7 @@ "postcss": "latest", "postcss-focus": "latest", "simple-git-hooks": "latest", - "standard": "latest", - "standard-markdown": "latest" + "standard": "latest" }, "engines": { "node": ">= 16" @@ -189,7 +188,7 @@ "coverage": "c8 report --reporter=text-lcov > coverage/lcov.info", "dev": "concurrently \"gulp\" \"npm run dev:server\"", "dev:server": "browser-sync start --server --files \"index.html, README.md, static/**/*.(css|js)\"", - "lint": "standard-markdown README.md && standard", + "lint": "standard", "pretest": "npm run lint", "release": "lerna publish --yes --sort --conventional-commits -m \"chore(release): %s\" --create-release github", "test": "c8 pnpm --recursive test", @@ -213,9 +212,6 @@ "prettier-standard", "standard --fix" ], - "*.md": [ - "standard-markdown" - ], "package.json": [ "finepack" ] diff --git a/packages/metascraper-helpers/package.json b/packages/metascraper-helpers/package.json index bb76be2003..db88980034 100644 --- a/packages/metascraper-helpers/package.json +++ b/packages/metascraper-helpers/package.json @@ -3,7 +3,7 @@ "description": "Collection of helper functions used by metascraper", "homepage": "https://github.com/microlinkhq/metascraper/packages/metascraper-helpers", "version": "5.46.11", - "main": "index.js", + "main": "src/index.js", "author": { "email": "hello@microlink.io", "name": "microlink.io", diff --git a/packages/metascraper-helpers/index.js b/packages/metascraper-helpers/src/index.js similarity index 93% rename from packages/metascraper-helpers/index.js rename to packages/metascraper-helpers/src/index.js index 9668822441..bf65c751ee 100644 --- a/packages/metascraper-helpers/index.js +++ b/packages/metascraper-helpers/src/index.js @@ -5,7 +5,6 @@ const debug = require('debug-logfmt')('metascraper:find-rule') const condenseWhitespace = require('condense-whitespace') const { getExtension: mimeExtension } = require('mime') const capitalize = require('microsoft-capitalize') -const { JSDOM, VirtualConsole } = require('jsdom') const isRelativeUrl = require('is-relative-url') const fileExtension = require('file-extension') const _normalizeUrl = require('normalize-url') @@ -466,40 +465,6 @@ const composeRule = const has = value => value !== undefined && !Number.isNaN(value) && hasValues(value) -const loadIframe = (url, $, { timeout = 5000 } = {}) => - new Promise(resolve => { - const dom = new JSDOM($.html(), { - url, - virtualConsole: new VirtualConsole(), - runScripts: 'dangerously', - resources: 'usable' - }) - - const done = (html = '') => resolve($.load(html)) - - const listen = (element, method, fn) => - element[`${method}EventListener`]('load', fn, { - capture: true, - once: true, - passive: true - }) - - const iframe = dom.window.document.querySelector('iframe') - if (!iframe) return done() - - const timer = setTimeout(() => { - listen(iframe, 'remove', load) - done() - }, timeout) - - function load () { - clearTimeout(timer) - done(iframe.contentDocument.documentElement.outerHTML) - } - - listen(iframe, 'add', load) - }) - const getUrls = input => String(input).match(urlRegexForMatch) ?? [] module.exports = { @@ -536,7 +501,7 @@ module.exports = { iso6393, jsonld, lang, - loadIframe, + loadIframe: require('./load-iframe'), logo, memoizeOne, mimeExtension, diff --git a/packages/metascraper-helpers/src/load-iframe/index.js b/packages/metascraper-helpers/src/load-iframe/index.js new file mode 100644 index 0000000000..c12f965288 --- /dev/null +++ b/packages/metascraper-helpers/src/load-iframe/index.js @@ -0,0 +1,18 @@ +'use strict' + +const { Worker } = require('worker_threads') +const path = require('path') + +const SCRIPT_PATH = path.resolve(__dirname, 'worker.js') + +module.exports = (url, $, { timeout = 5000 } = {}) => { + const worker = new Worker(SCRIPT_PATH, { + workerData: { url, html: $.html(), timeout }, + stdout: true, + stderr: true + }) + const { promise, resolve, reject } = Promise.withResolvers() + worker.on('message', html => resolve($.load(html || ''))) + worker.on('error', reject) + return promise +} diff --git a/packages/metascraper-helpers/src/load-iframe/worker.js b/packages/metascraper-helpers/src/load-iframe/worker.js new file mode 100644 index 0000000000..0a23956884 --- /dev/null +++ b/packages/metascraper-helpers/src/load-iframe/worker.js @@ -0,0 +1,37 @@ +'use strict' + +const { workerData, parentPort } = require('node:worker_threads') +const { JSDOM, VirtualConsole } = require('jsdom') + +async function main ({ url, html, timeout }) { + const dom = new JSDOM(html, { + url, + virtualConsole: new VirtualConsole(), + runScripts: 'dangerously', + resources: 'usable' + }) + + const iframe = dom.window.document.querySelector('iframe') + if (!iframe) return + + let timeoutId + + const waitForIframe = new Promise(resolve => { + iframe.addEventListener( + 'load', + () => { + clearTimeout(timeoutId) + resolve(iframe.contentDocument.documentElement.outerHTML) + }, + { once: true } + ) + }) + + const timeoutReached = new Promise( + resolve => (timeoutId = setTimeout(resolve, timeout)) + ) + + return Promise.race([waitForIframe, timeoutReached]) +} + +main(workerData).then(html => parentPort.postMessage(html)) diff --git a/packages/metascraper-helpers/test/index.js b/packages/metascraper-helpers/test/index.js index cd26d6d34d..2cdfd5bbd7 100644 --- a/packages/metascraper-helpers/test/index.js +++ b/packages/metascraper-helpers/test/index.js @@ -29,7 +29,7 @@ const { parseUrl, url, video -} = require('..') +} = require('../src') const measure = fn => { const time = process.hrtime() diff --git a/packages/metascraper-helpers/test/load-iframe.js b/packages/metascraper-helpers/test/load-iframe.js index ba03d3760d..79b8a62429 100644 --- a/packages/metascraper-helpers/test/load-iframe.js +++ b/packages/metascraper-helpers/test/load-iframe.js @@ -22,3 +22,14 @@ test('wait `load` event', async t => { const $iframe = await loadIframe(url, $) t.true($iframe.html().includes('twitter:player')) }) + +test('markup is correct', async t => { + const url = + 'https://saas.transistor.fm/episodes/paul-jarvis-gaining-freedom-by-building-an-indie-business' + const src = 'https://share.transistor.fm/e/e83b42d0' + const $ = await loadIframe( + url, + cheerio.load(``) + ) + t.snapshot($.html()) +}) diff --git a/packages/metascraper-helpers/test/snapshots/load-iframe.js.md b/packages/metascraper-helpers/test/snapshots/load-iframe.js.md new file mode 100644 index 0000000000..c1b83cc0e2 --- /dev/null +++ b/packages/metascraper-helpers/test/snapshots/load-iframe.js.md @@ -0,0 +1,680 @@ +# Snapshot report for `test/load-iframe.js` + +The actual snapshot is saved in `load-iframe.js.snap`. + +Generated by [AVA](https://avajs.dev). + +## markup is correct + +> Snapshot 1 + + `␊ + ␊ + ␊ + Paul Jarvis: gaining freedom by building an indie business - Audio player␊ + ␊ + ␊ + ␊ + ␊ + ␊ + ␊ +
␊ + ␊ + ␊ +
␊ + ␊ +
␊ + ␊ +
␊ + ␊ + ␊ + ␊ +
␊ + ␊ +
␊ +
␊ +
␊ + Build Your SaaS␊ + Trailer␊ + Bonus␊ + Episode 155␊ + Season 1␊ +
␊ +
␊ +
␊ +

Paul Jarvis: gaining freedom by building an indie business

␊ +

␊ + Paul Jarvis: gaining freedom by building an indie businessPaul Jarvis: gaining freedom by building an indie business␊ +

␊ +
␊ +
␊ +
␊ + ␊ +
␊ + ␊ + ␊ + ␊ +
␊ +
␊ + ␊ +
␊ + 00:00␊ +
␊ + ␊ + ␊ + ␊ + ␊ + ␊ + ␊ + ␊ + ␊ + ␊ +
␊ + ␊ +
␊ +
␊ +
␊ +
␊ + ␊ + ␊ + ␊ + ␊ + ␊ + ␊ +
␊ + ␊ + ␊ + ␊ +
␊ + ␊ + ␊ + ␊ +
␊ + ␊ + ␊ + ␊ + ␊ + ␊ + ␊ + ␊ + ␊ + ␊ +
␊ + ␊ +
␊ + ␊ + ␊ + ␊ + ` diff --git a/packages/metascraper-helpers/test/snapshots/load-iframe.js.snap b/packages/metascraper-helpers/test/snapshots/load-iframe.js.snap new file mode 100644 index 0000000000..9521e35cf0 Binary files /dev/null and b/packages/metascraper-helpers/test/snapshots/load-iframe.js.snap differ diff --git a/packages/metascraper-readability/benchmark/document-write.js b/packages/metascraper-readability/benchmark/document-write.js index 3cd4a4d38a..0bbd01a7b9 100644 --- a/packages/metascraper-readability/benchmark/document-write.js +++ b/packages/metascraper-readability/benchmark/document-write.js @@ -2,9 +2,10 @@ const { Window } = require('happy-dom') const { readFileSync } = require('fs') +const path = require('path') const url = 'https://arxiv.org/pdf/2412.06592' -const html = readFileSync('./fixture.html', 'utf8') +const html = readFileSync(path.resolve(__dirname, './fixture.html'), 'utf8') const isEqual = (value1, value2) => JSON.stringify(value1) === JSON.stringify(value2) @@ -20,7 +21,7 @@ const cases = { const window = new Window({ url }) const document = window.document document.write(html) - await window.happyDOM.waitUntilComplete() + // await window.happyDOM.waitUntilComplete() return document } } diff --git a/packages/metascraper-readability/src/index.js b/packages/metascraper-readability/src/index.js index 76b116fc57..3531a9686f 100644 --- a/packages/metascraper-readability/src/index.js +++ b/packages/metascraper-readability/src/index.js @@ -9,7 +9,9 @@ const SCRIPT_PATH = path.resolve(__dirname, 'worker.js') const readability = asyncMemoizeOne((url, html, readabilityOpts) => { const worker = new Worker(SCRIPT_PATH, { - workerData: { url, html, readabilityOpts } + workerData: { url, html, readabilityOpts }, + stdout: true, + stderr: true }) const { promise, resolve, reject } = Promise.withResolvers() worker.on('message', message => resolve(JSON.parse(message))) diff --git a/packages/metascraper-readability/src/worker.js b/packages/metascraper-readability/src/worker.js index 36d4fdad14..bf4a53f905 100644 --- a/packages/metascraper-readability/src/worker.js +++ b/packages/metascraper-readability/src/worker.js @@ -15,7 +15,7 @@ const getDocument = ({ url, html }) => { const { Window } = require('happy-dom') const window = new Window({ url }) const document = window.document - document.documentElement.innerHTML = html + document.write(html) return document } diff --git a/packages/metascraper/test/integration/the-verge/snapshots/index.js.md b/packages/metascraper/test/integration/the-verge/snapshots/index.js.md index 78d0f557dd..7a0805ce5e 100644 --- a/packages/metascraper/test/integration/the-verge/snapshots/index.js.md +++ b/packages/metascraper/test/integration/the-verge/snapshots/index.js.md @@ -10,7 +10,7 @@ Generated by [AVA](https://avajs.dev). { audio: null, - author: null, + author: 'Nick Statt', date: '2016-05-24T20:49:03.000Z', description: 'Apple could open Siri to third-party apps very soon', image: 'https://img.connatix.com/pid-13bd7676-705c-4894-8449-b8e67fcddc1f/8ffb59d5-b093-4d68-aa90-1dd263c5d84d/1_th.jpg?crop=600:338,smart&width=600&height=338&quality=60&fit=crop', diff --git a/packages/metascraper/test/integration/the-verge/snapshots/index.js.snap b/packages/metascraper/test/integration/the-verge/snapshots/index.js.snap index 4b80c682d4..e0dff8ea67 100644 Binary files a/packages/metascraper/test/integration/the-verge/snapshots/index.js.snap and b/packages/metascraper/test/integration/the-verge/snapshots/index.js.snap differ diff --git a/packages/metascraper/test/integration/transistor/snapshots/index.js.md b/packages/metascraper/test/integration/transistor/snapshots/index.js.md index b5c4bef335..f6c56afd21 100644 --- a/packages/metascraper/test/integration/transistor/snapshots/index.js.md +++ b/packages/metascraper/test/integration/transistor/snapshots/index.js.md @@ -15,7 +15,7 @@ Generated by [AVA](https://avajs.dev). description: null, image: 'https://images.transistor.fm/file/transistor/images/episode/263474/medium_1590257639-artwork.jpg', lang: null, - logo: 'https://transistor.fm/favicon.ico', + logo: 'https://assets.transistor.fm/assets/favicon-6e847568a3fbd949ba82be5cb10951e28b51f7f58d6e573235b0cddf494de2c0.ico', publisher: '#032 – Before and After Product-Market Fit with Peter and Calvin from Segment', title: 'Transistor Embed | #032 – Before and After Product-Market Fit with Peter and Calvin from Segment', url: 'https://share.transistor.fm/e/70c487ed', diff --git a/packages/metascraper/test/integration/transistor/snapshots/index.js.snap b/packages/metascraper/test/integration/transistor/snapshots/index.js.snap index 06b77c31bd..b986c54e47 100644 Binary files a/packages/metascraper/test/integration/transistor/snapshots/index.js.snap and b/packages/metascraper/test/integration/transistor/snapshots/index.js.snap differ diff --git a/packages/metascraper/test/integration/vimeo/snapshots/index.js.md b/packages/metascraper/test/integration/vimeo/snapshots/index.js.md index 8de79a0620..7b1c2f3543 100644 --- a/packages/metascraper/test/integration/vimeo/snapshots/index.js.md +++ b/packages/metascraper/test/integration/vimeo/snapshots/index.js.md @@ -13,7 +13,7 @@ Generated by [AVA](https://avajs.dev). author: 'MEGAFORCE', date: '2021-10-22T08:58:48.000Z', description: 'FULL CREDITS: AGENCY & PRODUCTION: @riffrafffilms CREATIVE & DIRECTION: @the_megaforce EXEC PRODUCER/OWNER: Matthew Fone PRODUCER: @cathyhoodx PRODUCTION…', - image: 'https://i.vimeocdn.com/custom_asset/21a684b9352e090b70ace38910e58e70', + image: 'https://i.vimeocdn.com/filter/overlay?src0=https%3A%2F%2Fi.vimeocdn.com%2Fvideo%2F1280513620-2e984a8e28be47e615461106d5ff44a4e8d9bf6f03d53c7e2_1280x853&src1=https%3A%2F%2Ff.vimeocdn.com%2Fimages_v6%2Fshare%2Fplay_icon_overlay.png', lang: 'en', logo: 'https://i.vimeocdn.com/favicon/main-touch_180', publisher: 'Vimeo', diff --git a/packages/metascraper/test/integration/vimeo/snapshots/index.js.snap b/packages/metascraper/test/integration/vimeo/snapshots/index.js.snap index 48a3655a25..94fe6b2c62 100644 Binary files a/packages/metascraper/test/integration/vimeo/snapshots/index.js.snap and b/packages/metascraper/test/integration/vimeo/snapshots/index.js.snap differ