Support for Web platforms (#340)

* Implement cycleclock::Now for PNaCl

* Make cycleclock::Now compatible with NaCl/ARM

* Support Emscripten (Asm.js, WebAssembly)

* Rearrange #ifs to handle specific cases first

* DoNotOptimize without inline asm for Emscripten & PNaCl
diff --git a/include/benchmark/benchmark_api.h b/include/benchmark/benchmark_api.h
index f953e59..66cbd7e 100644
--- a/include/benchmark/benchmark_api.h
+++ b/include/benchmark/benchmark_api.h
@@ -230,7 +230,7 @@
 // expression from being optimized away by the compiler. This function is
 // intended to add little to no overhead.
 // See: https://youtu.be/nXaxk27zwlk?t=2441
-#if defined(__GNUC__)
+#if defined(__GNUC__) && !defined(__pnacl__) && !defined(EMSCRIPTEN)
 template <class Tp>
 inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
   asm volatile("" : : "g"(value) : "memory");
diff --git a/src/cycleclock.h b/src/cycleclock.h
index ca26cca..e0f9b01 100644
--- a/src/cycleclock.h
+++ b/src/cycleclock.h
@@ -43,6 +43,11 @@
 
 #ifndef BENCHMARK_OS_WINDOWS
 #include <sys/time.h>
+#include <time.h>
+#endif
+
+#ifdef BENCHMARK_OS_EMSCRIPTEN
+#include <emscripten.h>
 #endif
 
 namespace benchmark {
@@ -65,6 +70,10 @@
   // counter pauses; it does not continue counting, nor does it
   // reset to zero.
   return mach_absolute_time();
+#elif defined(BENCHMARK_OS_EMSCRIPTEN)
+  // this goes above x86-specific code because old versions of Emscripten
+  // define __x86_64__, although they have nothing to do with it.
+  return static_cast<int64_t>(emscripten_get_now() * 1e+6);
 #elif defined(__i386__)
   int64_t ret;
   __asm__ volatile("rdtsc" : "=A"(ret));
@@ -99,6 +108,22 @@
   _asm rdtsc
 #elif defined(COMPILER_MSVC)
   return __rdtsc();
+#elif defined(BENCHMARK_OS_NACL)
+  // Native Client validator on x86/x86-64 allows RDTSC instructions,
+  // and this case is handled above. Native Client validator on ARM
+  // rejects MRC instructions (used in the ARM-specific sequence below),
+  // so we handle it here. Portable Native Client compiles to
+  // architecture-agnostic bytecode, which doesn't provide any
+  // cycle counter access mnemonics.
+
+  // Native Client does not provide any API to access cycle counter.
+  // Use clock_gettime(CLOCK_MONOTONIC, ...) instead of gettimeofday
+  // because it provides nanosecond resolution (which is noticeable at
+  // least for PNaCl modules running on x86 Mac & Linux).
+  // Initialize to always return 0 if clock_gettime fails.
+  struct timespec ts = { 0, 0 };
+  clock_gettime(CLOCK_MONOTONIC, &ts);
+  return static_cast<int64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
 #elif defined(__aarch64__)
   // System timer of ARMv8 runs at a different frequency than the CPU's.
   // The frequency is fixed, typically in the range 1-50MHz.  It can be
@@ -108,7 +133,9 @@
   asm volatile("mrs %0, cntvct_el0" : "=r"(virtual_timer_value));
   return virtual_timer_value;
 #elif defined(__ARM_ARCH)
-#if (__ARM_ARCH >= 6)  // V6 is the earliest arch that has a standard cyclecount
+  // V6 is the earliest arch that has a standard cyclecount
+  // Native Client validator doesn't allow MRC instructions.
+#if (__ARM_ARCH >= 6)
   uint32_t pmccntr;
   uint32_t pmuseren;
   uint32_t pmcntenset;
diff --git a/src/internal_macros.h b/src/internal_macros.h
index 2b3f32f..ab9dd85 100644
--- a/src/internal_macros.h
+++ b/src/internal_macros.h
@@ -41,6 +41,10 @@
 #define BENCHMARK_OS_FREEBSD 1
 #elif defined(__linux__)
 #define BENCHMARK_OS_LINUX 1
+#elif defined(__native_client__)
+#define BENCHMARK_OS_NACL 1
+#elif defined(EMSCRIPTEN)
+#define BENCHMARK_OS_EMSCRIPTEN 1
 #endif
 
 #if !__has_feature(cxx_exceptions) && !defined(__cpp_exceptions) \
diff --git a/src/timers.cc b/src/timers.cc
index fadc08f..8d56e8a 100644
--- a/src/timers.cc
+++ b/src/timers.cc
@@ -35,6 +35,10 @@
 #endif
 #endif
 
+#ifdef BENCHMARK_OS_EMSCRIPTEN
+#include <emscripten.h>
+#endif
+
 #include <cerrno>
 #include <cstdint>
 #include <cstdio>
@@ -100,14 +104,7 @@
 }  // end namespace
 
 double ProcessCPUUsage() {
-// FIXME We want to use clock_gettime, but its not available in MacOS 10.11. See
-// https://github.com/google/benchmark/pull/292
-#if defined(CLOCK_PROCESS_CPUTIME_ID) && !defined(BENCHMARK_OS_MACOSX)
-  struct timespec spec;
-  if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &spec) == 0)
-    return MakeTime(spec);
-  DiagnoseAndExit("clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) failed");
-#elif defined(BENCHMARK_OS_WINDOWS)
+#if defined(BENCHMARK_OS_WINDOWS)
   HANDLE proc = GetCurrentProcess();
   FILETIME creation_time;
   FILETIME exit_time;
@@ -117,21 +114,28 @@
                       &user_time))
     return MakeTime(kernel_time, user_time);
   DiagnoseAndExit("GetProccessTimes() failed");
+#elif defined(BENCHMARK_OS_EMSCRIPTEN)
+  // clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) returns 0 on Emscripten.
+  // Use Emscripten-specific API. Reported CPU time would be exactly the
+  // same as total time, but this is ok because there aren't long-latency
+  // synchronous system calls in Emscripten.
+  return emscripten_get_now() * 1e-3;
+#elif defined(CLOCK_PROCESS_CPUTIME_ID) && !defined(BENCHMARK_OS_MACOSX)
+  // FIXME We want to use clock_gettime, but it's not available in MacOS 10.11. See
+  // https://github.com/google/benchmark/pull/292
+  struct timespec spec;
+  if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &spec) == 0)
+    return MakeTime(spec);
+  DiagnoseAndExit("clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) failed");
 #else
   struct rusage ru;
   if (getrusage(RUSAGE_SELF, &ru) == 0) return MakeTime(ru);
-  DiagnoseAndExit("clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) failed");
+  DiagnoseAndExit("getrusage(RUSAGE_SELF, ...) failed");
 #endif
 }
 
 double ThreadCPUUsage() {
-// FIXME We want to use clock_gettime, but its not available in MacOS 10.11. See
-// https://github.com/google/benchmark/pull/292
-#if defined(CLOCK_THREAD_CPUTIME_ID) && !defined(BENCHMARK_OS_MACOSX)
-  struct timespec ts;
-  if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0) return MakeTime(ts);
-  DiagnoseAndExit("clock_gettime(CLOCK_THREAD_CPUTIME_ID, ...) failed");
-#elif defined(BENCHMARK_OS_WINDOWS)
+#if defined(BENCHMARK_OS_WINDOWS)
   HANDLE this_thread = GetCurrentThread();
   FILETIME creation_time;
   FILETIME exit_time;
@@ -141,6 +145,8 @@
                  &user_time);
   return MakeTime(kernel_time, user_time);
 #elif defined(BENCHMARK_OS_MACOSX)
+  // FIXME We want to use clock_gettime, but it's not available in MacOS 10.11. See
+  // https://github.com/google/benchmark/pull/292
   mach_msg_type_number_t count = THREAD_BASIC_INFO_COUNT;
   thread_basic_info_data_t info;
   mach_port_t thread = pthread_mach_thread_np(pthread_self());
@@ -149,6 +155,13 @@
     return MakeTime(info);
   }
   DiagnoseAndExit("ThreadCPUUsage() failed when evaluating thread_info");
+#elif defined(BENCHMARK_OS_EMSCRIPTEN)
+  // Emscripten doesn't support traditional threads
+  return ProcessCPUUsage();
+#elif defined(CLOCK_THREAD_CPUTIME_ID)
+  struct timespec ts;
+  if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0) return MakeTime(ts);
+  DiagnoseAndExit("clock_gettime(CLOCK_THREAD_CPUTIME_ID, ...) failed");
 #else
 #error Per-thread timing is not available on your system.
 #endif