All code runs on latest master.
Literally, I just patched and re-ran these yesterday.
It's not intentional.
The result of ridiculous amounts of benchmarks and analysis.
Hoisting our functions
// primegen0.js -- everything inlined inside the nextTick callback
// (the "unhoisted" version of the benchmark).
var SB = require('buffer').SlowBuffer;

var max = 3e4;

(function runTick() {
  process.nextTick(function genPrimes() {
    if (--max < 0)
      return;

    // One bit per candidate; a set bit means "still considered prime".
    var primes = [];
    var byteLen = (max >>> 3) + 1;
    var sieve = new SB(byteLen);
    sieve.fill(0xff, 0, byteLen);

    var count = 0;
    for (var candidate = 2; candidate <= max; ++candidate) {
      // Skip candidates whose bit was cleared by a smaller prime.
      if (!(sieve[candidate >>> 3] & (1 << (candidate & 7))))
        continue;
      primes[count++] = candidate;
      // Clear every multiple of the prime we just found.
      for (var multiple = 2 * candidate; multiple <= max; multiple += candidate)
        sieve[multiple >>> 3] &= ~(1 << (multiple & 7));
    }

    runTick();
  });
}());
Runtime:
$ /usr/bin/time node primegen0.js 19.53user 0.00system 0:19.58elapsed 99%CPU (51040maxresident)k 0inputs+0outputs (0major+13104minor)pagefaults 0swaps
var SB = require('buffer').SlowBuffer;
// Sieve of Eratosthenes over a bit-per-candidate SlowBuffer.
// Returns the array of primes found (new; callers that ignore the
// return value are unaffected).
// NOTE(review): `--max` here decrements again on top of the caller's
// own `--max`, so the sieve bound is one lower than in primegen0 --
// kept for parity with the original slide, but confirm it's intended.
function genPrimes(max) {
  if (--max < 0)
    return [];
  var primes = [];
  var len = (max >>> 3) + 1;
  var sieve = new SB(len);
  sieve.fill(0xff, 0, len);
  var cntr, x, j;
  for (cntr = 0, x = 2; x <= max; ++x) {
    if (sieve[x >>> 3] & (1 << (x & 7))) {
      primes[cntr++] = x;
      for (j = 2 * x; j <= max; j += x)
        sieve[j >>> 3] &= ~(1 << (j & 7));
    }
  }
  // BUG FIX: the original also called runTick() here, so every tick
  // scheduled two new callbacks (once here and once in runTick's own
  // callback). Scheduling belongs to the driver alone.
  return primes;
}
var max = 3e4;

// Driver: schedule one sieve run per turn of the event loop, stopping
// once the shared counter runs out.
function runTick() {
  process.nextTick(function onTick() {
    if (--max >= 0) {
      genPrimes(max);
      runTick();
    }
  });
}

runTick();
Runtime:
$ /usr/bin/time node primegen1.js 4.59user 0.04system 0:04.64elapsed 99%CPU (50540maxresident)k 0inputs+0outputs (0major+12977minor)pagefaults 0swaps
Here it was over 4x's faster (19.53s vs 4.59s user time).
e.g. Why I hate UTF8
binary 16B 110.72 ns/op ascii 16B 110.06 ns/op N/A utf8 16B 115.04 ns/op N/A binary 1KB 157.29 ns/op ascii 1KB 246.49 ns/op -36.2% utf8 1KB 254.67 ns/op -38.2% binary 32KB 2.37 μs/op ascii 32KB 4.57 μs/op -92.8% utf8 32KB 5.13 μs/op -116.5% binary 128KB 8.79 μs/op ascii 128KB 17.00 μs/op -93.4% utf8 128KB 19.65 μs/op -123.5% binary 512KB 36.74 μs/op ascii 512KB 73.67 μs/op -100.5% utf8 512KB 104.23 μs/op -183.9%
binary 16B 111.21 ns/op utf8 16B 156.31 ns/op -40.5% binary 1KB 159.45 ns/op utf8 1KB 2.65 μs/op -1566.7% binary 32KB 2.41 μs/op utf8 32KB 130.33 μs/op -5306.7% binary 128KB 9.16 μs/op utf8 128KB 539.72 μs/op -5719.8% binary 512KB 37.89 μs/op utf8 512KB 2.61 ms/op -6788.4%
Max, this is for you : )
*cough* leveldb *cough*
Important Assumption: All keys fit in one byte space.
var size = 4;

// Create a ridiculous JSON object with utf8 values.
var json = {};
for (var i = 0; i < size; i++)
  json[i] = 'foo:\u0222 bar:\u0223';

// First setup the data to look like it would after being read in
// from a file.
var json_string = JSON.stringify(json);

// We're going to assume the file was written in UTF8.
// (Buffer.from replaces the deprecated `new Buffer(string, enc)`
// constructor -- see Node's DEP0005.)
var json_buffer = Buffer.from(json_string, 'utf8');

// Start working with the data.
// 'binary' (latin1) maps each raw byte to U+00xx, so multi-byte UTF-8
// sequences show up as two mojibake characters.
var binary_json = json_buffer.toString('binary');

// Unfortunately JSON.parse always assumes UTF8.
// But I'm working on a hack for that.
var json_binary = JSON.parse(binary_json);

// Here we'll see the values look all fubar, but before they're
// written out to a stream we can convert them back to a Buffer using
// binary encoding and they'll have the same representation.
console.log(json_binary);
Output: Actual: { '0': 'foo:È¢ bar:È£', { '0': 'foo:Ȣ bar:ȣ', '1': 'foo:È¢ bar:È£', '1': 'foo:Ȣ bar:ȣ', '2': 'foo:È¢ bar:È£', '2': 'foo:Ȣ bar:ȣ', '3': 'foo:È¢ bar:È£' } '3': 'foo:Ȣ bar:ȣ' }
Yeah. Strange I know, but it'll all work out.
var db = require('levelup')('./mydb');

// Convert each value to Buffer before passing down the pipe.
// (Buffer.from replaces the deprecated `new Buffer(...)` constructor,
// and the previously-ignored `err` arguments are now surfaced instead
// of being silently swallowed.)
db.put(0, Buffer.from(json_binary[0], 'binary'), function(err) {
  if (err) throw err;
  db.get(0, function(err, val) {
    if (err) throw err;
    console.log(val);
  });
});
Output - foo:Ȣ bar:ȣ
We make the assumption that all HTTP headers are already in ASCII space, so no need for the additional parsing.
What happens to performance as string size grows?
Externalized Strings to the rescue!
(inflection point is around 1MB)
Mutable Strings!!!
// Overwrite the externalized storage of a string in place.
// args[0]: the (externalized) string to mutate; args[1]: a one-char
// fill string.
void AfterFill(const FunctionCallbackInfo<Value>& args) {
  HandleScope scope(args.GetIsolate());
  Local<String> long_string = args[0].As<String>();
  assert(long_string->IsString());
  assert(args[1]->IsString());
  String::AsciiValue fill_char(args[1]);
  char* data = NULL;
  size_t length = 0;
  // BUG FIX: this call used to live *inside* assert(), so NDEBUG
  // (release) builds compiled it away entirely and the memset() below
  // dereferenced a NULL `data` pointer.
  bool external = node::StringBytes::GetExternalParts(
      long_string, const_cast<const char**>(&data), &length);
  assert(external);
  if (!external || data == NULL)
    return;  // fail soft in release builds if the string isn't external
  memset(data, (*fill_char)[0], length);
}
// Build a 1MB string of 'a's and flatten it to a binary string that
// V8 can externalize.
// Buffer.alloc(size, fill) replaces the deprecated `new Buffer(size)`
// (which returned *uninitialized* memory) plus the separate .fill().
var long_string = Buffer.alloc(1024 * 1024, 'a').toString('binary');
console.log(long_string.substr(0, 10));
// afterFill is the C++ binding above; it mutates the string's
// externalized storage in place.
afterFill(long_string, 'b');
console.log(long_string.substr(0, 10));
Output: aaaaaaaaaa bbbbbbbbbb
DON'T MUTATE YOUR STRINGS!!!
What could be so hard?
More than you think
// Property-name strings kept as raw bytes; SetInCCSlow re-creates a
// v8::String from them on *every* call (the "slow" path under test).
static const uint8_t* length_sym;
static const uint8_t* type_sym;
static const uint8_t* used_sym;
// Benchmark: build {length, type, used} entirely in C++, paying the
// byte->v8::String conversion cost for each property name per call.
void SetInCCSlow(const FunctionCallbackInfo<Value>& args) {
  Isolate* isolate = args.GetIsolate();
  HandleScope scope(isolate);
  // Create a new Object to contain our properties.
  Local<Object> obj = Object::New();
  // Passing in three property types.
  Local<Number> arg0 = args[0]->ToNumber();
  Local<String> arg1 = args[1]->ToString();
  Local<Boolean> arg2 = args[2]->ToBoolean();
  // Set the Object properties. NewFromOneByte allocates a fresh handle
  // for the name every time -- this is the cost the _Sym variant avoids.
  obj->Set(String::NewFromOneByte(isolate, length_sym), arg0);
  obj->Set(String::NewFromOneByte(isolate, type_sym), arg1);
  obj->Set(String::NewFromOneByte(isolate, used_sym), arg2);
  args.GetReturnValue().Set(obj);
}
// Done on initialize.
// (These only alias the string literals; no v8 handles are created.)
length_sym = reinterpret_cast<const uint8_t*>("length");
type_sym = reinterpret_cast<const uint8_t*>("type");
used_sym = reinterpret_cast<const uint8_t*>("used");
// Property-name strings created once at startup and kept alive in
// Persistent handles so every call can reuse them.
Persistent<String> p_length_sym;
Persistent<String> p_type_sym;
Persistent<String> p_used_sym;
// Only safe if the Persistent isn't Weak.
// Reinterprets the Persistent's storage cell as a Local without going
// through the v8 API: a strong (non-weak) Persistent can't be
// collected, so the cell it points at stays valid for the cast.
template<class T>
inline Local<T> ToLocal(Persistent<T>* p_) {
  return *reinterpret_cast<Local<T>*>(p_);
}
// Benchmark: same as SetInCCSlow, but the property-name Strings are
// created once (below) and reused via Persistent handles.
void SetInCCSym(const FunctionCallbackInfo<Value>& args) {
  Isolate* isolate = args.GetIsolate();
  HandleScope scope(isolate);
  Local<Object> obj = Object::New();
  Local<Number> arg0 = args[0]->ToNumber();
  Local<String> arg1 = args[1]->ToString();
  Local<Boolean> arg2 = args[2]->ToBoolean();
  // This time we'll use the Persistent Strings.
  obj->Set(ToLocal<String>(&p_length_sym), arg0);
  obj->Set(ToLocal<String>(&p_type_sym), arg1);
  obj->Set(ToLocal<String>(&p_used_sym), arg2);
  args.GetReturnValue().Set(obj);
}
// Done on initialize.
// (Creates each name String once and pins it in its Persistent;
// `isolate` here is the one available at module init time.)
p_length_sym.Reset(isolate, String::NewFromOneByte(isolate, length_sym));
p_type_sym.Reset(isolate, String::NewFromOneByte(isolate, type_sym));
p_used_sym.Reset(isolate, String::NewFromOneByte(isolate, used_sym));
// Benchmark: create the object in C++ but let a previously-persisted
// JS function (setProperties) attach the named properties to it.
void SetInJS(const FunctionCallbackInfo<Value>& args) {
  Isolate* isolate = args.GetIsolate();
  HandleScope scope(isolate);
  Local<Object> obj = Object::New();
  Local<Number> arg0 = args[0]->ToNumber();
  Local<String> arg1 = args[1]->ToString();
  Local<Boolean> arg2 = args[2]->ToBoolean();
  Local<Value> argv[3] = { arg0, arg1, arg2 };
  // We've previously Persisted the below JS function.
  // `obj` is the receiver, so setProperties sees it as `this`.
  ToLocal<Function>(&p_setprop_fn)->Call(obj, 3, argv);
  args.GetReturnValue().Set(obj);
}
// object is created in cc and set as instance of function when called
// Called from C++ with the freshly created object bound as `this`;
// copies the three values onto it as named properties.
function setProperties(length, type, used) {
  var target = this;
  target.length = length;
  target.type = type;
  target.used = used;
}
void SetInJSObj(const FunctionCallbackInfo<Value>& args) {
Isolate* isolate = args.GetIsolate();
HandleScope scope(isolate);
Local<Number> arg0 = args[0]->ToNumber();
Local<String> arg1 = args[1]->ToString();
Local<Boolean> arg2 = args[2]->ToBoolean();
Local<Value> argv[3] = { arg0, arg1, arg2 };
// Pass in previously persisted empty Object.
Local<Function> fn = ToLocal<Function>(&p_setobj_fn);
obj = fn->Call(obj, 3, argv);
args.GetReturnValue().Set(obj);
}
// returns new object setting object properties passed from cc
// Builds and returns a fresh object holding the three values handed
// over by the C++ caller (receiver is ignored).
function setPropObj(length, type, used) {
  var result = {};
  result.length = length;
  result.type = type;
  result.used = used;
  return result;
}
setInCCSlow - 1.56 μs/op setInCCSym - 809.65 ns/op setInJS - 370.62 ns/op setInJSObj - 148.61 ns/op
Even in the optimized C++ case, calling to JS is 4.5x's faster.
Now we enter the rabbit hole.
// Setup our object with named properties.
// (accessNamed is the C++ binding below; `iter` is the benchmark's
// iteration count, defined elsewhere.)
var obj = { length: 10, type: 'string', used: true };
accessNamed(obj, iter);
// Benchmark: read the three properties by *named* keys, `iter` times.
void AccessNamed(const FunctionCallbackInfo<Value>& args) {
  Isolate* isolate = args.GetIsolate();
  HandleScope scope(isolate);
  Local<Object> obj = args[0].As<Object>();
  size_t iter = args[1]->Uint32Value();
  assert(obj->IsObject());
  for (size_t i = 0; i < iter; i++) {
    // Results are intentionally unused; only the named-property lookup
    // cost is being measured.
    size_t length = obj->Get(ToLocal(&p_length_sym))->Uint32Value();
    Local<String> type = obj->Get(ToLocal(&p_type_sym)).As<String>();
    bool used = obj->Get(ToLocal(&p_used_sym))->IsTrue();
  }
}
// Use indexed properties for values instead.
// (Same values as the named-property case, keyed 0/1/2.)
var obj = { 0: 10, 1: 'string', 2: true };
accessIndexed(obj, iter);
// Benchmark: read the same three properties by *index*, `iter` times --
// the fast counterpart to AccessNamed.
void AccessIndexed(const FunctionCallbackInfo<Value>& args) {
  Isolate* isolate = args.GetIsolate();
  HandleScope scope(isolate);
  Local<Object> obj = args[0].As<Object>();
  size_t iter = args[1]->Uint32Value();
  assert(obj->IsObject());
  for (size_t i = 0; i < iter; i++) {
    // Results are intentionally unused; only the indexed lookup cost is
    // being measured.
    size_t length = obj->Get(0)->Uint32Value();
    Local<String> type = obj->Get(1).As<String>();
    bool used = obj->Get(2)->IsTrue();
  }
}
accessNamed - 256.78 ns/op accessIndexed - 71.02 ns/op ~3.6x's faster
For the few that wonder what's going on in process.nextTick()
// This tickInfo thing is used so that the C++ code in src/node.cc
// can have easy accesss to our nextTick state, and avoid unnecessary
// calls into JS land.
var tickInfo = process._tickInfo;
// Done this way to make each placement much easier to remember.
var kInTick = 0;
var kIndex = 1;
var kLastThrew = 2;
var kLength = 3;
// ... further down ...
function nextTick(callback) {
nextTickQueue.push({
callback: callback,
domain: process.domain || null
});
// By manually tracking the queue length we can quickly alert C++
// whether there are callbacks to be processed or not.
tickInfo[kLength]++;
}
// We can access each flag without touching the v8 API.
// (Fragment from the MakeCallback path in src/node.cc; tick_info
// mirrors the JS-side tickInfo array above.)
if (tick_info->in_tick() == 1) {
  // Already draining the tick queue; don't re-enter.
  return ret;
}
if (tick_info->length() == 0) {
  // Queue is empty -- reset the cursor and skip the call into JS.
  tick_info->set_index(0);
  return ret;
}
// All early-exit flags checked: there is real work queued, so the
// call into JS below is actually necessary.
tick_callback_function->Call(process_object, 0, NULL);
Running the following in a custom build, counting the number of calls from MakeCallback
without then with the optimizations:
./node benchmark/net/net-c2s.js len=10240 type=buf dur=5
Without the optimization: ~106,000 calls/sec
With the optimization: ~14,000 calls/sec
That's pushing 100k calls into JS we've bypassed with this little trick.
Core has a reason for the insanity.
We want Node to stay out of your way.
If you want to dive into core, please feel free to ask me.
slides and examples: github.com/trevnorris/talks/
freenode: trevnorris
twitter: @trevnorris
you get the idea