Skip to content

Commit 92c715d

Browse files
authored
Merge pull request #762 from microlinkhq/next
feat: add `omitPropNames`
2 parents c88ddfd + e17eee9 commit 92c715d

File tree

11 files changed

+173
-64
lines changed

11 files changed

+173
-64
lines changed

README.md

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,31 @@ Default: `true`
285285

286286
Ensure the URL provided is validated as a [WHATWG URL](https://nodejs.org/api/url.html#url_the_whatwg_url_api) API compliant.
287287

288+
#### omitPropNames
289+
290+
Type: `Set`<br>
291+
Default: `new Set()`
292+
293+
A set of property names that should be omitted. When specified, these properties will be absent from the returned metadata object, and the rules associated with them will not be evaluated.
294+
295+
```js
296+
const metascraper = require('metascraper')([
297+
require('metascraper-title')(),
298+
require('metascraper-image')(),
299+
require('metascraper-description')()
300+
])
301+
302+
const html = '<title>Example</title><meta property="og:image" content="image.jpg">'
303+
const url = 'https://example.com'
304+
305+
// Omit the image property
306+
const omitPropNames = new Set(['image'])
307+
const metadata = await metascraper({ url, html, omitPropNames })
308+
309+
console.log(metadata)
310+
// Output: { title: 'Example', description: null }
311+
```
312+
288313
## Environment Variables
289314

290315
#### METASCRAPER_RE2

packages/metascraper/benchmark/index.js renamed to packages/metascraper/benchmark/merge-rules.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ const benchmark = (fn, iterations = CONFIG.BENCHMARK_ITERATIONS) => {
5252
}
5353

5454
console.log('┌────────────────────────────────────────────────┐')
55-
console.log('│ MergeRules Benchmark │')
55+
console.log('│ MergeRules Benchmark │')
5656
console.log('└────────────────────────────────────────────────┘\n')
5757

5858
// Large comprehensive test case

packages/metascraper/package.json

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,6 @@
6161
"@metascraper/helpers": "workspace:*",
6262
"cheerio": "~1.1.0",
6363
"debug-logfmt": "~1.2.3",
64-
"lodash": "~4.17.21",
6564
"whoops": "~5.0.1"
6665
},
6766
"devDependencies": {

packages/metascraper/src/get-data.js

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,19 @@
22

33
const debug = require('debug-logfmt')('metascraper:get-data')
44
const { findRule, has } = require('@metascraper/helpers')
5-
const { map, fromPairs } = require('lodash')
65

7-
const normalizeValue = value => (has(value) ? value : null)
8-
9-
const getData = async ({ rules, name, ...props }) => {
6+
const getData = async ({ rules, ...props }) => {
107
const data = await Promise.all(
11-
map(rules, async ([propName, innerRules]) => {
8+
rules.map(async ([propName, innerRules]) => {
129
const duration = debug.duration()
1310
const value = await findRule(innerRules, props, propName)
14-
const normalizedValue = normalizeValue(value)
11+
const normalizedValue = has(value) ? value : null
1512
duration(`${propName}=${normalizedValue} rules=${innerRules.length}`)
1613
return [propName, normalizedValue]
1714
})
1815
)
1916

20-
return fromPairs(data)
17+
return Object.fromEntries(data)
2118
}
2219

2320
module.exports = getData

packages/metascraper/src/index.d.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,11 @@ declare namespace createMetascraper {
3434
* Ensure the URL provided is validated as a WHATWG URL API compliant.
3535
*/
3636
validateUrl?: boolean;
37+
/**
38+
* A Set of property names to omit from the metadata extraction process.
39+
* These properties will be filtered out before processing the rules.
40+
*/
41+
omitPropNames?: Set<string>;
3742
}
3843

3944
export interface Metadata {

packages/metascraper/src/index.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ module.exports = rules => {
1717
htmlDom,
1818
rules: inlineRules,
1919
validateUrl = true,
20+
omitPropNames = new Set(),
2021
...props
2122
} = {}) => {
2223
if (validateUrl && !isUrl(url)) {
@@ -29,7 +30,7 @@ module.exports = rules => {
2930
return getData({
3031
url,
3132
htmlDom: htmlDom ?? load(html, { baseURI: url }),
32-
rules: mergeRules(inlineRules, loadedRules),
33+
rules: mergeRules(inlineRules, loadedRules, omitPropNames),
3334
...props
3435
})
3536
}

packages/metascraper/src/rules.js

Lines changed: 64 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,57 +1,69 @@
11
'use strict'
22

3-
const {
4-
cloneDeep,
5-
concat,
6-
first,
7-
findIndex,
8-
forEach,
9-
chain,
10-
castArray,
11-
has,
12-
set
13-
} = require('lodash')
14-
15-
const forEachRule = (collection, fn) => forEach(castArray(collection), fn)
16-
17-
const loadRules = rulesBundle =>
18-
chain(rulesBundle)
19-
.reduce((acc, { test, pkgName, ...rules }) => {
20-
forEach(rules, (innerRules, propName) => {
21-
forEachRule(innerRules, rule => {
3+
const castArray = value => (Array.isArray(value) ? value : [value])
4+
5+
const loadRules = rulesBundle => {
6+
const acc = {}
7+
8+
for (const { test, pkgName, ...rules } of rulesBundle) {
9+
for (const [propName, innerRules] of Object.entries(rules)) {
10+
const processedRules = castArray(innerRules)
11+
if (test || pkgName) {
12+
for (const rule of processedRules) {
2213
if (test) rule.test = test
23-
rule.pkgName = pkgName ?? 'uknown'
24-
})
25-
26-
set(
27-
acc,
28-
propName,
29-
has(acc, propName)
30-
? concat(acc[propName], innerRules)
31-
: concat(innerRules)
32-
)
33-
34-
return acc
35-
})
36-
return acc
37-
}, {})
38-
.toPairs()
39-
.value()
40-
41-
const mergeRules = (rules, baseRules) =>
42-
chain(rules)
43-
.reduce((acc, { test, ...rules }) => {
44-
forEach(rules, (innerRules, propName) => {
45-
if (test) forEachRule(innerRules, rule => (rule.test = test))
46-
// find the rules associated with `propName`
47-
const index = findIndex(acc, item => first(item) === propName)
48-
// if `propName` has more rule, add the new rule from the end
49-
if (index !== -1) acc[index][1] = concat(innerRules, ...acc[index][1])
50-
// otherwise, create an array of rules
51-
else acc.push([propName, castArray(innerRules)])
52-
})
53-
return acc
54-
}, cloneDeep(baseRules))
55-
.value()
14+
if (pkgName) rule.pkgName = pkgName ?? 'unknown'
15+
}
16+
}
17+
18+
if (acc[propName]) {
19+
acc[propName].push(...processedRules)
20+
} else {
21+
acc[propName] = processedRules
22+
}
23+
}
24+
}
25+
return Object.entries(acc)
26+
}
27+
28+
const mergeRules = (rules, baseRules, omitPropNames = new Set()) => {
29+
const result = {}
30+
31+
// Process base rules first (shallow clone arrays only)
32+
for (const [propName, ruleArray] of baseRules) {
33+
if (!omitPropNames.has(propName)) {
34+
result[propName] = [...ruleArray] // Shallow clone array
35+
}
36+
}
37+
38+
// Handle case where rules might be null/undefined or not an array
39+
if (!rules || !Array.isArray(rules)) {
40+
return Object.entries(result)
41+
}
42+
43+
// Process inline rules
44+
for (const { test, ...ruleSet } of rules) {
45+
for (const [propName, innerRules] of Object.entries(ruleSet)) {
46+
if (omitPropNames.has(propName)) continue
47+
48+
const processedRules = Array.isArray(innerRules)
49+
? [...innerRules]
50+
: [innerRules]
51+
if (test) {
52+
for (const rule of processedRules) {
53+
rule.test = test
54+
}
55+
}
56+
57+
if (result[propName]) {
58+
// Prepend new rules to match original concat(innerRules, existing) behavior
59+
result[propName] = [...processedRules, ...result[propName]]
60+
} else {
61+
result[propName] = processedRules
62+
}
63+
}
64+
}
65+
66+
return Object.entries(result)
67+
}
5668

5769
module.exports = { mergeRules, loadRules }

packages/metascraper/test/integration/transistor/index.js

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ const url = 'https://share.transistor.fm/e/70c487ed'
2525

2626
test('transistor.fm', async t => {
2727
const html = await readFile(resolve(__dirname, 'input.html'))
28-
const metadata = await metascraper({ html, url })
28+
const { logo, ...metadata } = await metascraper({ html, url })
2929
t.snapshot(metadata)
30+
t.is(typeof logo, 'string')
31+
t.true(new URL(logo).hostname.endsWith('.gstatic.com'), logo)
3032
})

packages/metascraper/test/integration/transistor/snapshots/index.js.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ Generated by [AVA](https://avajs.dev).
1515
description: null,
1616
image: 'https://images.transistor.fm/file/transistor/images/episode/263474/medium_1590257639-artwork.jpg',
1717
lang: null,
18-
logo: 'https://t1.gstatic.com/faviconV2?client=SOCIAL&type=FAVICON&fallback_opts=TYPE,SIZE,URL&url=https://share.transistor.fm/e/70c487ed&size=128',
1918
publisher: '#032 – Before and After Product-Market Fit with Peter and Calvin from Segment',
2019
title: 'Transistor Embed | #032 – Before and After Product-Market Fit with Peter and Calvin from Segment',
2120
url: 'https://share.transistor.fm/e/70c487ed',
Binary file not shown.

0 commit comments

Comments
 (0)