8000 buffer: optimize Buffer.byteLength by brendanashworth · Pull Request #1713 · nodejs/node · GitHub
[go: up one dir, main page]

Skip to content
55 changes: 55 additions & 0 deletions benchmark/buffers/buffer-bytelength.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
var common = require('../common');

var bench = common.createBenchmark(main, {
encoding: ['utf8', 'base64'],
len: [1, 2, 4, 16, 64, 256], // x16
n: [5e6]
});

// 16 chars each
var chars = [
'hello brendan!!!', // 1 byte
'ΰαβγδεζηθικλμνξο', // 2 bytes
'挰挱挲挳挴挵挶挷挸挹挺挻挼挽挾挿', // 3 bytes
'𠜎𠜱𠝹𠱓𠱸𠲖𠳏𠳕𠴕𠵼𠵿𠸎𠸏𠹷𠺝𠺢' // 4 bytes
];

function main(conf) {
var n = conf.n | 0;
var len = conf.len | 0;
var encoding = conf.encoding;

var strings = [];
for (var string of chars) {
// Strings must be built differently, depending on encoding
var data = buildString(string, len);
if (encoding === 'utf8') {
strings.push(data);
} else if (encoding === 'base64') {
// Base64 strings will be much longer than their UTF8 counterparts
strings.push(new Buffer(data, 'utf8').toString('base64'));
}
}

// Check the result to ensure it is *properly* optimized
var results = strings.map(function(val) {
return Buffer.byteLength(val, encoding);
});

bench.start();
for (var i = 0; i < n; i++) {
var index = n % strings.length;
// Go!
var r = Buffer.byteLength(strings[index], encoding);

if (r !== results[index])
throw Error('incorrect return value');
}
bench.end(n);
}

function buildString(str, times) {
if (times == 1) return str;

return str + buildString(str, times - 1);
}
66 changes: 49 additions & 17 deletions lib/buffer.js
Original file line number Diff line number Diff line change
Expand Up @@ -272,30 +272,62 @@ Buffer.concat = function(list, length) {
};


function base64ByteLength(str, bytes) {
// Handle padding
if (str.charCodeAt(bytes - 1) === 0x3D)
bytes--;
if (bytes > 1 && str.charCodeAt(bytes - 1) === 0x3D)
bytes--;

// Base64 ratio: 3/4
return (bytes * 3) >>> 2;
}


function byteLength(string, encoding) {
if (typeof(string) !== 'string')
string = String(string);
if (typeof string !== 'string')
string = '' + string;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this a change in functionality? ByteLength() would throw if a non string was passed.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nope, String(val) also does coercion. We seem to prefer '' + val though.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bnoordhuis do we care if node aborts if someone does:

process.binding('buffer').byteLengthUtf8(42);

Or do we want to continue throwing?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think so. Fooling around with process.binding() voids the warranty.


if (string.length === 0)
var len = string.length;
if (len === 0)
return 0;

switch (encoding) {
case 'ascii':
case 'binary':
case 'raw':
return string.length;
// Use a for loop to avoid recursion
var loweredCase = false;
for (;;) {
switch (encoding) {
case 'ascii':
case 'binary':
// Deprecated
case 'raw':
case 'raws':
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was supported by the C++ version, but isn't currently supported by Buffer.isEncoding.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW, I think 'raw' and 'raws' have been deprecated since v0.2 or v0.3. I've never seen them being used in the wild either.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can follow this PR up with a deprecation commit (to keep it semver-patch) - technically speaking only raws was deprecated, not raw, for this function:

> Buffer.byteLength('abc', 'raw')
3
> Buffer.byteLength('abc', 'raws')
'raws' encoding has been renamed to 'binary'. Please update your code.
3

return len;

case 'ucs2':
case 'ucs-2':
case 'utf16le':
case 'utf-16le':
return string.length * 2;
case 'utf8':
case 'utf-8':
return binding.byteLengthUtf8(string);

case 'hex':
return string.length >>> 1;
}
case 'ucs2':
case 'ucs-2':
case 'utf16le':
case 'utf-16le':
return len * 2;

case 'hex':
return len >>> 1;

return binding.byteLength(string, encoding);
case 'base64':
return base64ByteLength(string, len);

default:
// The C++ binding defaulted to UTF8, we should too.
if (loweredCase)
return binding.byteLengthUtf8(string);

encoding = ('' + encoding).toLowerCase();
loweredCase = true;
}
}
}

Buffer.byteLength = byteLength;
Expand Down
16 changes: 5 additions & 11 deletions src/node_buffer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -541,17 +541,11 @@ void WriteDoubleBE(const FunctionCallbackInfo<Value>& args) {
}


void ByteLength(const FunctionCallbackInfo<Value> &args) {
Environment* env = Environment::GetCurrent(args);

if (!args[0]->IsString())
return env->ThrowTypeError("Argument must be a string");

Local<String> s = args[0]->ToString(env->isolate());
enum encoding e = ParseEncoding(env->isolate(), args[1], UTF8);
void ByteLengthUtf8(const FunctionCallbackInfo<Value> &args) {
CHECK(args[0]->IsString());

uint32_t size = StringBytes::Size(env->isolate(), s, e);
args.GetReturnValue().Set(size);
// Fast case: avoid StringBytes on UTF8 string. Jump to v8.
args.GetReturnValue().Set(args[0].As<String>()->Utf8Length());
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you can reduce this function to just:

void ByteLengthUtf8(const FunctionCallbackInfo<Value>& args) {
  CHECK(args[0]->IsString());
  args.GetReturnValue().Set(args[0].As<String>()->Utf8Length());
}

EDIT: s/UTF8/Utf8/ for consistency with other functions/types operating on UTF-8.



Expand Down Expand Up @@ -745,7 +739,7 @@ void Initialize(Handle<Object> target,

env->SetMethod(target, "setupBufferJS", SetupBufferJS);

env->SetMethod(target, "byteLength", ByteLength);
env->SetMethod(target, "byteLengthUtf8"< 6D3F /span>, ByteLengthUtf8);
env->SetMethod(target, "compare", Compare);
env->SetMethod(target, "fill", Fill);
env->SetMethod(target, "indexOfBuffer", IndexOfBuffer);
Expand Down
46 changes: 46 additions & 0 deletions test/parallel/test-buffer-bytelength.js
9E81
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
'use strict';

var common = require('../common');
var assert = require('assert');
var Buffer = require('buffer').Buffer;

// coerce values to string
assert.equal(Buffer.byteLength(32, 'raw'), 2);
assert.equal(Buffer.byteLength(NaN, 'utf8'), 3);
assert.equal(Buffer.byteLength({}, 'raws'), 15);
assert.equal(Buffer.byteLength(), 9);

// special case: zero length string
assert.equal(Buffer.byteLength('', 'ascii'), 0);
assert.equal(Buffer.byteLength('', 'HeX'), 0);

// utf8
assert.equal(Buffer.byteLength('∑éllö wørl∂!', 'utf-8'), 19);
assert.equal(Buffer.byteLength('κλμνξο', 'utf8'), 12);
assert.equal(Buffer.byteLength('挵挶挷挸挹', 'utf-8'), 15);
assert.equal(Buffer.byteLength('𠝹𠱓𠱸', 'UTF8'), 12);
// without an encoding, utf8 should be assumed
assert.equal(Buffer.byteLength('hey there'), 9);
assert.equal(Buffer.byteLength('𠱸挶νξ#xx :)'), 17);
assert.equal(Buffer.byteLength('hello world', ''), 11);
// it should also be assumed with unrecognized encoding
assert.equal(Buffer.byteLength('hello world', 'abc'), 11);
assert.equal(Buffer.byteLength('ßœ∑≈', 'unkn0wn enc0ding'), 10);

// base64
assert.equal(Buffer.byteLength('aGVsbG8gd29ybGQ=', 'base64'), 11);
assert.equal(Buffer.byteLength('bm9kZS5qcyByb2NrcyE=', 'base64'), 14);
assert.equal(Buffer.byteLength('aGkk', 'base64'), 3);
assert.equal(Buffer.byteLength('bHNrZGZsa3NqZmtsc2xrZmFqc2RsZmtqcw==',
'base64'), 25);
// special padding
assert.equal(Buffer.byteLength('aaa=', 'base64'), 2);
assert.equal(Buffer.byteLength('aaaa==', 'base64'), 3);

assert.equal(Buffer.byteLength('Il était tué'), 14);
assert.equal(Buffer.byteLength('Il était tué', 'utf8'), 14);
assert.equal(Buffer.byteLength('Il était tué', 'ascii'), 12);
assert.equal(Buffer.byteLength('Il était tué', 'binary'), 12);
['ucs2', 'ucs-2', 'utf16le', 'utf-16le'].forEach(function(encoding) {
assert.equal(24, Buffer.byteLength('Il était tué', encoding));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"He was killed"?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just copied it from the old test-buffer.js.

});
13 changes: 0 additions & 13 deletions test/parallel/test-buffer.js
Original file line number Diff line number Diff line change
Expand Up @@ -560,15 +560,6 @@ assert.equal(sb, s);
b = new Buffer('abcde');
assert.equal('bcde', b.slice(1).toString());

// byte length
assert.equal(14, Buffer.byteLength('Il était tué'));
assert.equal(14, Buffer.byteLength('Il était tué', 'utf8'));
['ucs2', 'ucs-2', 'utf16le', 'utf-16le'].forEach(function(encoding) {
assert.equal(24, Buffer.byteLength('Il était tué', encoding));
});
assert.equal(12, Buffer.byteLength('Il était tué', 'ascii'));
assert.equal(12, Buffer.byteLength('Il était tué', 'binary'));

// slice(0,0).length === 0
assert.equal(0, Buffer('hello').slice(0, 0).length);

Expand Down Expand Up @@ -1073,10 +1064,6 @@ assert.equal(buf.readInt8(0), -1);
assert.ok(typeof Buffer(5).slice(0, 5).parent === 'object');
})();

// Make sure byteLength properly checks for base64 padding
assert.equal(Buffer.byteLength('aaa=', 'base64'), 2);
assert.equal(Buffer.byteLength('aaaa==', 'base64'), 3);

// Regression test for #5482: should throw but not assert in C++ land.
assert.throws(function() {
Buffer('', 'buffer');
Expand Down
0