Skip to content

Commit d0e3d11

Browse files
committed
feat(unicode): ensure 140 char text limit is unicode aware
1 parent 130da32 commit d0e3d11

File tree

3 files changed

+45
-2
lines changed

3 files changed

+45
-2
lines changed

helper/unicode.js

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
const _ = require('lodash');
22
const regenerate = require('regenerate');
3+
const unicodeToArray = require('lodash/_unicodeToArray');
34

45
// non-printable control characters
56
// ref: https://en.wikipedia.org/wiki/List_of_Unicode_characters
@@ -94,3 +95,31 @@ function normalize(str) {
9495
}
9596

9697
module.exports.normalize = normalize;
98+
99+
// unicode aware string length function
// note: ported from 'npm stringz' using 'lodash' internals in place of 'char-regex'
//
// Splits the input with lodash's internal unicodeToArray() and returns the
// number of elements produced, instead of relying on String.prototype.length
// (which counts UTF-16 code units).
// Throws Error('invalid string') when the argument is not a string.
module.exports.length = (str) => {

  if (_.isString(str)) {
    // one array element per unicode symbol
    const symbols = unicodeToArray(str);
    return symbols.length;
  }

  // anything other than a string is rejected outright
  throw new Error('invalid string');
};
109+
110+
// unicode aware substring function
// note: ported from 'npm stringz' using 'lodash' internals in place of 'char-regex'
//
// Splits the input with lodash's internal unicodeToArray(), slices the
// resulting array of symbols from `begin` up to (not including) `end`, and
// joins the slice back into a string.
// Throws Error('invalid string') when the first argument is not a string.
module.exports.substring = (str, begin, end) => {

  // sanity checking
  if (!_.isString(str)) { throw new Error('invalid string'); }

  // Even though negative numbers would work with Array#slice, they're not in
  // the spec, so they are clamped to zero here.
  // A non-finite `begin` (undefined, NaN, Infinity, non-number) also becomes 0.
  const from = (_.isFinite(begin) && begin >= 0) ? begin : 0;

  // Only a finite negative `end` is clamped; any other value is handed to
  // Array#slice untouched (an undefined `end` slices through to the last symbol).
  const to = (_.isFinite(end) && end < 0) ? 0 : end;

  const symbols = unicodeToArray(str);

  // nothing to slice when the input produced no symbols
  return symbols.length > 0 ? symbols.slice(from, to).join('') : '';
};

sanitizer/_text.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,9 @@ function _sanitize( raw, clean ){
2121
if( !_.isString(text) || _.isEmpty(text) ){
2222
messages.errors.push(`invalid param 'text': text length, must be >0`);
2323
} else {
24-
if( text.length > MAX_TEXT_LENGTH ){
24+
if( unicode.length(text) > MAX_TEXT_LENGTH ){
2525
messages.warnings.push(`param 'text' truncated to ${MAX_TEXT_LENGTH} characters`);
26-
text = text.substring(0, MAX_TEXT_LENGTH);
26+
text = unicode.substring(text, 0, MAX_TEXT_LENGTH);
2727
}
2828
clean.text = text;
2929
}

test/unit/sanitizer/_text.js

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
const sanitizer = require('../../../sanitizer/_text')();
2+
const unicode = require('../../../helper/unicode');
23

34
module.exports.tests = {};
45

@@ -154,6 +155,19 @@ it again and again until we reach our destination.` };
154155
t.deepEquals(messages.warnings, [`param 'text' truncated to 140 characters`]);
155156
t.end();
156157
});
158+
159+
// https://github.com/pelias/api/issues/1574
160+
test('truncate should be unicode aware', (t) => {
161+
const raw = { text: 'a' + '👩‍❤️‍👩'.repeat(200) };
162+
const clean = {};
163+
const messages = sanitizer.sanitize(raw, clean);
164+
165+
t.equals(unicode.length(clean.text), 140);
166+
t.equals(clean.text, 'a' + '👩‍❤️‍👩'.repeat(139));
167+
t.deepEquals(messages.errors, [], 'no errors');
168+
t.deepEquals(messages.warnings, [`param 'text' truncated to 140 characters`]);
169+
t.end();
170+
});
157171
};
158172

159173
module.exports.all = (tape, common) => {

0 commit comments

Comments
 (0)