I have been in loved with ROOT – CERN’s Data Analysis framework and it’s interactive C++ shell. But how fast is the code JITed by ROOT? I have find myself compiling ROOT scripts (aka. ACLiC, basically clang) more then I run them in the interpreter. So, let’s find that out today. Further more, how fast is ACLiC compared to a standalone compiler?
For the test, I’ll let ROOT and other compiler compiler and run SmallPT and see how long it for each of the program takes to render the scene at 1024×764 pixels, 4 samples per pixel. – For those who doesn’t know, SmallPT is a 99 line C++ PBR renderer. And is featured in the Phoronix test suite.
Running the tests n my main box.
CPU | AMD Ryzen 1700X 16 threads |
RAM | DDR4 2888MHz Dual Channel |
OS | Arch Linux, Kernel 5.1.14 |
Chipset | AMD FCH B350 |
The contenders are:
Compiler/Interpreter | Version |
GCC – O0/O1/O2/O3 (no OpenMP) | 9.1.0 |
clang – O0/O1/O2/O3 (no OpenMP) | 8.0.0 |
ROOT – JIT | 6.0.16 |
ROOT – ACLiC | 6.0.16 |
cling | 0.6 |
And I slightly modified SmallPT so modern clang can compile it and ROOT can run it.
Two locations are changed. A static_cast
is added so clang does not complain about down casting and a smallpt()
function is added for ROOT and cling to execute the renderer automatically.
#include <math.h> // smallpt, a Path Tracer by Kevin Beason, 2008
#include <stdlib.h> // Make : g++ -O3 -fopenmp smallpt.cpp -o smallpt
#include <stdio.h> // Remove "-fopenmp" for g++ version < 4.2
#include <chrono>
#include <iostream>
struct Vec { // Usage: time ./smallpt 5000 && xv image.ppm
double x, y, z; // position, also color (r,g,b)
Vec(double x_=0, double y_=0, double z_=0){ x=x_; y=y_; z=z_; }
Vec operator+(const Vec &b) const { return Vec(x+b.x,y+b.y,z+b.z); }
Vec operator-(const Vec &b) const { return Vec(x-b.x,y-b.y,z-b.z); }
Vec operator*(double b) const { return Vec(x*b,y*b,z*b); }
Vec mult(const Vec &b) const { return Vec(x*b.x,y*b.y,z*b.z); }
Vec& norm(){ return *this = *this * (1/sqrt(x*x+y*y+z*z)); }
double dot(const Vec &b) const { return x*b.x+y*b.y+z*b.z; } // cross:
Vec operator%(Vec&b){return Vec(y*b.z-z*b.y,z*b.x-x*b.z,x*b.y-y*b.x);}
};
struct Ray { Vec o, d; Ray(Vec o_, Vec d_) : o(o_), d(d_) {} };
enum Refl_t { DIFF, SPEC, REFR }; // material types, used in radiance()
struct Sphere {
double rad; // radius
Vec p, e, c; // position, emission, color
Refl_t refl; // reflection type (DIFFuse, SPECular, REFRactive)
Sphere(double rad_, Vec p_, Vec e_, Vec c_, Refl_t refl_):
rad(rad_), p(p_), e(e_), c(c_), refl(refl_) {}
double intersect(const Ray &r) const { // returns distance, 0 if nohit
Vec op = p-r.o; // Solve t^2*d.d + 2*t*(o-p).d + (o-p).(o-p)-R^2 = 0
double t, eps=1e-4, b=op.dot(r.d), det=b*b-op.dot(op)+rad*rad;
if (det<0) return 0; else det=sqrt(det);
return (t=b-det)>eps ? t : ((t=b+det)>eps ? t : 0);
}
};
Sphere spheres[] = {//Scene: radius, position, emission, color, material
Sphere(1e5, Vec( 1e5+1,40.8,81.6), Vec(),Vec(.75,.25,.25),DIFF),//Left
Sphere(1e5, Vec(-1e5+99,40.8,81.6),Vec(),Vec(.25,.25,.75),DIFF),//Rght
Sphere(1e5, Vec(50,40.8, 1e5), Vec(),Vec(.75,.75,.75),DIFF),//Back
Sphere(1e5, Vec(50,40.8,-1e5+170), Vec(),Vec(), DIFF),//Frnt
Sphere(1e5, Vec(50, 1e5, 81.6), Vec(),Vec(.75,.75,.75),DIFF),//Botm
Sphere(1e5, Vec(50,-1e5+81.6,81.6),Vec(),Vec(.75,.75,.75),DIFF),//Top
Sphere(16.5,Vec(27,16.5,47), Vec(),Vec(1,1,1)*.999, SPEC),//Mirr
Sphere(16.5,Vec(73,16.5,78), Vec(),Vec(1,1,1)*.999, REFR),//Glas
Sphere(600, Vec(50,681.6-.27,81.6),Vec(12,12,12), Vec(), DIFF) //Lite
};
inline double clamp(double x){ return x<0 ? 0 : x>1 ? 1 : x; }
inline int toInt(double x){ return int(pow(clamp(x),1/2.2)*255+.5); }
inline bool intersect(const Ray &r, double &t, int &id){
double n=sizeof(spheres)/sizeof(Sphere), d, inf=t=1e20;
for(int i=int(n);i--;) if((d=spheres[i].intersect(r))&&d<t){t=d;id=i;}
return t<inf;
}
Vec radiance(const Ray &r, int depth, unsigned short *Xi){
double t; // distance to intersection
int id=0; // id of intersected object
if (!intersect(r, t, id)) return Vec(); // if miss, return black
const Sphere &obj = spheres[id]; // the hit object
Vec x=r.o+r.d*t, n=(x-obj.p).norm(), nl=n.dot(r.d)<0?n:n*-1, f=obj.c;
double p = f.x>f.y && f.x>f.z ? f.x : f.y>f.z ? f.y : f.z; // max refl
if (++depth>5) if (erand48(Xi)<p) f=f*(1/p); else return obj.e; //R.R.
if (obj.refl == DIFF){ // Ideal DIFFUSE reflection
double r1=2*M_PI*erand48(Xi), r2=erand48(Xi), r2s=sqrt(r2);
Vec w=nl, u=((fabs(w.x)>.1?Vec(0,1):Vec(1))%w).norm(), v=w%u;
Vec d = (u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrt(1-r2)).norm();
return obj.e + f.mult(radiance(Ray(x,d),depth,Xi));
} else if (obj.refl == SPEC) // Ideal SPECULAR reflection
return obj.e + f.mult(radiance(Ray(x,r.d-n*2*n.dot(r.d)),depth,Xi));
Ray reflRay(x, r.d-n*2*n.dot(r.d)); // Ideal dielectric REFRACTION
bool into = n.dot(nl)>0; // Ray from outside going in?
double nc=1, nt=1.5, nnt=into?nc/nt:nt/nc, ddn=r.d.dot(nl), cos2t;
if ((cos2t=1-nnt*nnt*(1-ddn*ddn))<0) // Total internal reflection
return obj.e + f.mult(radiance(reflRay,depth,Xi));
Vec tdir = (r.d*nnt - n*((into?1:-1)*(ddn*nnt+sqrt(cos2t)))).norm();
double a=nt-nc, b=nt+nc, R0=a*a/(b*b), c = 1-(into?-ddn:tdir.dot(n));
double Re=R0+(1-R0)*c*c*c*c*c,Tr=1-Re,P=.25+.5*Re,RP=Re/P,TP=Tr/(1-P);
return obj.e + f.mult(depth>2 ? (erand48(Xi)<P ? // Russian roulette
radiance(reflRay,depth,Xi)*RP:radiance(Ray(x,tdir),depth,Xi)*TP) :
radiance(reflRay,depth,Xi)*Re+radiance(Ray(x,tdir),depth,Xi)*Tr);
}
int main(int argc, char *argv[]){
int w=1024, h=768, samps = argc==2 ? atoi(argv[1])/4 : 1; // # samples
Ray cam(Vec(50,52,295.6), Vec(0,-0.042612,-1).norm()); // cam pos, dir
Vec cx=Vec(w*.5135/h), cy=(cx%cam.d).norm()*.5135, r, *c=new Vec[w*h];
#pragma omp parallel for schedule(dynamic, 1) private(r) // OpenMP
for (int y=0; y<h; y++){ // Loop over image rows
fprintf(stderr,"\rRendering (%d spp) %5.2f%%",samps*4,100.*y/(h-1));
for (unsigned short x=0, Xi[3]={0,0,static_cast<uint16_t>(y*y*y)}; x<w; x++) // Loop cols
for (int sy=0, i=(h-y-1)*w+x; sy<2; sy++) // 2x2 subpixel rows
for (int sx=0; sx<2; sx++, r=Vec()){ // 2x2 subpixel cols
for (int s=0; s<samps; s++){
double r1=2*erand48(Xi), dx=r1<1 ? sqrt(r1)-1: 1-sqrt(2-r1);
double r2=2*erand48(Xi), dy=r2<1 ? sqrt(r2)-1: 1-sqrt(2-r2);
Vec d = cx*( ( (sx+.5 + dx)/2 + x)/w - .5) +
cy*( ( (sy+.5 + dy)/2 + y)/h - .5) + cam.d;
r = r + radiance(Ray(cam.o+d*140,d.norm()),0,Xi)*(1./samps);
} // Camera rays are pushed ^^^^^ forward to start in interior
c[i] = c[i] + Vec(clamp(r.x),clamp(r.y),clamp(r.z))*.25;
}
}
FILE *f = fopen("image.ppm", "w"); // Write image to PPM file.
fprintf(f, "P3\n%d %d\n%d\n", w, h, 255);
for (int i=0; i<w*h; i++)
fprintf(f,"%d %d %d ", toInt(c[i].x), toInt(c[i].y), toInt(c[i].z));
}
void smallpt()
{
const char* argv[] = {"smallpt"};
auto t1 = std::chrono::high_resolution_clock::now();
main(1, (char**)argv);
auto t2 = std::chrono::high_resolution_clock::now();
std::cout << '\n' << std::chrono::duration<float>(t2-t1).count() << std::endl;
}
For the compilers, I simply compile SmallPT and use the time
command to time it. And for ROOT and cling, the C++11 timers are used.
And…. Here are the results! (click to enlarge)
Not surprisingly that ROOT’s JIT is quite slow. In fact it is slower then GCC and clang’s -O0 compilation. Not surprising after all, there’s not much you can do when you are running while compilation while ROOT have the added overhead to generate code while running. And seems cling 0.6 have received some optimization that haven’t gone into ROOT. It is that tiny bit faster.. Great!
Conclusion
ROOT’s C++ interpreter is fast (and definitely JITed, you can’t interpret byte code this fast). But it is nothing close to what a proper compiler can do. So I highly recommend to compile you C++ scripts in production. So you catch error early and runs faster.
TL;DR
ROOT’s JIT is not as fast a a compiler. But very good compared to languages like Python and PHP (obviously).
Compiler/arguments | Time (seconds) |
GCC -O0 | 18.197 |
clang -O0 | 18.227 |
GCC -O1 | 5.384 |
clang -O1 | 9.537 |
GCC -O2 | 4.954 |
clang -O2 | 5.617 |
GCC -O3 | 4.999 |
clang -O3 | 5.583 |
ROOT – JIT | 22.4223 |
ROOT – ACLiC | 4.86559 |
cling | 21.3674 |
`cppyy` uses cling. I came across this post wondering if the code it generates will be fast (that is, excluding the compilation time). It is! It’s about as fast as clang `-O2` or `-O3`. My conclusion: For python code that’s running in an environment with a C++ compiler, cppyy lets you run C++ code at native speed without monkeying around with Python’s build system to compile a module.
“`
import cppyy
cppyy.cppdef(open(“smallpt.cpp”).read())
cppyy.gbl.smallpt()
“`
LikeLike