diff --git a/_includes/side-nav-fast.html b/_includes/side-nav-fast.html
index 15002002..4768352e 100644
--- a/_includes/side-nav-fast.html
+++ b/_includes/side-nav-fast.html
@@ -2,6 +2,11 @@
   Performance Guide
+  Performance Hints
   Fast Tips
   {% assign sorted_posts = site.posts | sort: 'order' %}
diff --git a/_layouts/base.html b/_layouts/base.html
index a9a78c97..aa99cb60 100644
--- a/_layouts/base.html
+++ b/_layouts/base.html
@@ -19,6 +19,9 @@
+  {% if page.customcss %}
+  {% endif %}
diff --git a/_layouts/perf_hints.html b/_layouts/perf_hints.html
new file mode 100644
index 00000000..d8874043
--- /dev/null
+++ b/_layouts/perf_hints.html
@@ -0,0 +1,17 @@
+---
+layout: default
+---
+
+{% assign current = page.url | downcase | split: '/' %}
+{{ content }}
diff --git a/css/perf_hints.css b/css/perf_hints.css
new file mode 100644
index 00000000..8ec3b94f
--- /dev/null
+++ b/css/perf_hints.css
@@ -0,0 +1,28 @@
+/* Reset some styling needed elsewhere */
+body {
+  margin-right: 0rem !important;
+}
+
+summary {
+  display: list-item;
+  background: #fafafa;
+  border-bottom: .0625rem #ccc dashed;
+  border-top: .0625rem #ccc dashed;
+  outline: none;
+  padding: .5625rem 0;
+  padding-left: 0.5rem;
+}
+
+.doc-side-nav {
+  display: none;
+}
+
+.markdown ul, .markdown ol {
+  list-style-position: outside;
+}
+
+/* Hide external link icon */
+a.external::after {
+  display: none;
+}
diff --git a/fast/hints.md b/fast/hints.md
new file mode 100644
index 00000000..96191b8c
--- /dev/null
+++ b/fast/hints.md
@@ -0,0 +1,6369 @@
+---
+title: "Performance Hints"
+layout: perf_hints
+sidenav: side-nav-fast.html
+customcss: perf_hints
+customjs: /js/perf_hints.js
+type: markdown
+---

# Performance Hints

[Jeff Dean](https://research.google/people/jeff/),
[Sanjay Ghemawat](https://research.google/people/sanjayghemawat/)

Original version: 2023/07/27, last updated: 2025/12/16

Over the years, we (Jeff & Sanjay) have done a fair bit of diving into the performance tuning of various pieces of code. Improving the performance of our software has been important from the very earliest days of Google, since it lets us do more for more users. We wrote this document as a way of identifying some general principles and specific techniques that we use when doing this sort of work, and we have tried to pick illustrative source code changes (change lists, or CLs) that provide examples of the various approaches and techniques. Most of the concrete suggestions below reference C++ types and CLs, but the general principles apply to other languages. The document focuses on general performance tuning in the context of a single binary; it does not cover distributed systems or machine learning (ML) hardware performance tuning (huge areas unto themselves). We hope others will find this useful.

*Many of the examples in the document have code fragments that demonstrate the techniques (click the little triangles!). Note that some of these code fragments mention various internal Google codebase abstractions. We have included them anyway when we felt the examples were self-contained enough to be understandable to those unfamiliar with the details of those abstractions.*

## The importance of thinking about performance {#the-importance-of-thinking-about-performance}

Knuth is often quoted out of context as saying *premature optimization is the root of all evil*. The [full quote](https://dl.acm.org/doi/pdf/10.1145/356635.356640) reads: *"We should forget about small efficiencies, say about 97% of the time: premature optimization is the root of all evil. Yet we should not pass up our opportunities in that critical 3%."* This document is about that critical 3%, and a more compelling quote, again from Knuth, reads:

> The improvement in speed from Example 2 to Example 2a is only about 12%, and
> many people would pronounce that insignificant. The conventional wisdom shared
> by many of today's software engineers calls for ignoring efficiency in the
> small; but I believe this is simply an overreaction to the abuses they see
> being practiced by penny-wise-and-pound-foolish programmers, who can't debug
> or maintain their "optimized" programs.
> In established engineering disciplines a 12% improvement, easily obtained, is
> never considered marginal; and I believe the same viewpoint should prevail in
> software engineering. Of course I wouldn't bother making such optimizations on
> a one-shot job, but when it's a question of preparing quality programs, I
> don't want to restrict myself to tools that deny me such efficiencies.

Many people will say "let's write down the code in as simple a way as possible and deal with performance later when we can profile". However, this approach is often wrong:

1.  If you disregard all performance concerns when developing a large system,
    you will end up with a flat profile with no obvious hotspots, because
    performance is lost all over the place. It will be difficult to figure out
    how to get started on performance improvements.
2.  If you are developing a library that will be used by other people, the
    people who run into performance problems will likely be people who cannot
    easily make performance improvements (they will have to understand the
    details of code written by other people/teams, and will have to negotiate
    with them about the importance of performance optimizations).
3.  It is harder to make significant changes to a system when it is in heavy
    use.
4.  It is also hard to tell whether there are performance problems that could
    be solved easily, and so we end up with potentially expensive workarounds
    like over-replication or severe overprovisioning of a service to handle
    load problems.

Instead, we suggest that when writing code, you choose the faster alternative whenever it does not significantly impact the readability or complexity of the code.

## Estimation

If you can develop an intuition for how much performance might matter in the code you are writing, you can make a more informed decision (e.g., about how much extra complexity is warranted in the name of performance). Some tips on estimating performance while you are writing code:

*   Is it test code? If so, you mostly need to worry about the asymptotic
    complexity of your algorithms and data structures. (Aside: development
    cycle time matters, so avoid writing tests that take a long time to run.)
*   Is it code specific to an application? If so, try to figure out how much
    performance matters for this piece of code. This is typically not very
    hard: just figuring out whether code is initialization/setup code vs. code
    that will end up on hot paths (e.g., processing every request in a service)
    is often sufficient.
*   Is it library code that will be used by many applications? In this case it
    is hard to tell how performance-sensitive it might become. This is where it
    becomes especially important to follow some of the simple techniques
    described in this document. For example, if you need to store a vector that
    usually has a small number of elements, use an `absl::InlinedVector`
    instead of `std::vector` (see the sketch after this list). Such techniques
    are not very hard to follow and don't add any non-local complexity to the
    system. If the code you are writing does end up using significant
    resources, it will be higher-performance from the start, and it will be
    easier to find the next thing to focus on when looking at a profile.
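As a minimal sketch of that last suggestion (the function and its workload are hypothetical, used only for illustration):

```c++
#include <cstdint>

#include "absl/container/inlined_vector.h"

// Collects the indices of set bits in a mask. In this (hypothetical) workload
// most masks have only a few bits set, so the result usually fits in the 8
// inline slots and no heap allocation happens at all.
absl::InlinedVector<int, 8> SetBitIndices(uint64_t mask) {
  absl::InlinedVector<int, 8> indices;
  for (int i = 0; i < 64; ++i) {
    if (mask & (uint64_t{1} << i)) indices.push_back(i);
  }
  return indices;
}
```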
You can do a slightly deeper analysis when picking between options with potentially different performance characteristics by relying on [back of the envelope calculations](https://en.wikipedia.org/wiki/Back-of-the-envelope_calculation). Such calculations can quickly give a very rough estimate of the performance of different alternatives, and the results can be used to discard some of the alternatives without having to implement them.

Here is how such an estimation might work:

1.  Estimate how many low-level operations of various kinds are required,
    e.g., number of disk seeks, number of network round-trips, bytes
    transmitted, etc.
2.  Multiply each kind of expensive operation by its rough cost, and add the
    results together.
3.  The preceding gives the *cost* of the system in terms of resource usage.
    If you are interested in latency, and if the system has any concurrency,
    some of the costs may overlap and you may have to do a slightly more
    complicated analysis to estimate the latency.

The following table, which is an updated version of a table from a [2007 talk at Stanford University](https://static.googleusercontent.com/media/research.google.com/en//people/jeff/stanford-295-talk.pdf) (video of the 2007 talk no longer exists, but there is a [video of a related 2011 Stanford talk that covers some of the same content](https://www.youtube.com/watch?v=modXC5IWTJI)), may be useful since it lists the types of operations to consider and their rough costs:

```
L1 cache reference                           0.5 ns
L2 cache reference                             3 ns
Branch mispredict                              5 ns
Mutex lock/unlock (uncontended)               15 ns
Main memory reference                         50 ns
Compress 1K bytes with Snappy              1,000 ns
Read 4KB from SSD                         20,000 ns
Round trip within same datacenter         50,000 ns
Read 1MB sequentially from memory         64,000 ns
Read 1MB over 100 Gbps network           100,000 ns
Read 1MB from SSD                      1,000,000 ns
Disk seek                              5,000,000 ns
Read 1MB sequentially from disk       10,000,000 ns
Send packet CA->Netherlands->CA      150,000,000 ns
```

The preceding table contains rough costs for some basic low-level operations. You may find it useful to also track estimated costs for higher-level operations relevant to your system. E.g., you might want to know the rough cost of a point read from your SQL database, the latency of interacting with a Cloud service, or the time to render a simple HTML page. If you don't know the relevant costs of different operations, you can't do decent back-of-the-envelope calculations!

### Example: Time to quicksort a billion 4-byte numbers

As a rough approximation, a good quicksort algorithm makes log(N) passes over an array of size N. On each pass, the array contents will be streamed from memory into the processor cache, and the partition code will compare each element once to a pivot element. Let's add up the dominant costs (see the sketch below):

1.  Memory bandwidth: the array occupies 4 GB (4 bytes per number times a
    billion numbers). Let's assume ~16GB/s of memory bandwidth per core. That
    means each pass will take ~0.25s. N is ~2^30, so we will make ~30 passes,
    so the total cost of memory transfer will be ~7.5 seconds.
2.  Branch mispredictions: we will do a total of N*log(N) comparisons, i.e.,
    ~30 billion comparisons. Let's assume that half of them (i.e., 15 billion)
    are mispredicted. Multiplying by 5 ns per misprediction, we get a
    misprediction cost of 75 seconds. We assume for this analysis that
    correctly predicted branches are free.
3.  Adding up the previous numbers, we get an estimate of ~82.5 seconds.
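The same arithmetic, written out as a runnable sketch; the constants are just the rough unit costs from the table above, so the output is only as good as those assumptions:

```c++
#include <cmath>
#include <cstdio>

// Rough unit costs taken from the table above.
constexpr double kBranchMispredictSec = 5e-9;      // 5 ns
constexpr double kMemBandwidthBytesPerSec = 16e9;  // ~16 GB/s per core

int main() {
  const double n = 1e9;                // one billion elements
  const double bytes = 4 * n;          // 4-byte numbers
  const double passes = std::log2(n);  // ~30 passes

  const double transfer_sec = passes * bytes / kMemBandwidthBytesPerSec;
  const double mispredicts = 0.5 * n * passes;  // assume half are mispredicted
  const double mispredict_sec = mispredicts * kBranchMispredictSec;

  std::printf("memory: %.1fs  mispredicts: %.1fs  total: %.1fs\n",
              transfer_sec, mispredict_sec, transfer_sec + mispredict_sec);
  // Prints roughly: memory: 7.5s  mispredicts: 74.7s  total: 82.2s
}
```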
If necessary, we could refine our analysis to account for processor caches. This refinement is probably not needed since branch mispredictions are the dominant cost according to the analysis above, but we include it here anyway as another example. Let's assume we have a 32MB L3 cache, and that the cost of transferring data from the L3 cache to the processor is negligible. The L3 cache can hold 2^23 numbers, and therefore the last 22 passes can operate on data resident in the L3 cache (the 23rd-last pass brings data into the L3 cache, and the remaining passes operate on that data). That cuts down the memory transfer cost to 2.5 seconds (10 memory transfers of 4GB at 16GB/s) instead of 7.5 seconds (30 memory transfers).

### Example: Time to generate a web page with 30 image thumbnails

Let's compare two potential designs where the original images are stored on disk, and each image is approximately 1MB in size.

1.  Read the contents of the 30 images serially and generate a thumbnail for
    each one. Each read takes one seek plus one transfer: 5ms for the seek and
    10ms for the transfer, so 30 images times 15ms per image gives 450ms.
2.  Read in parallel, assuming the images are spread evenly across K disks.
    The previous resource usage estimate still holds, but latency will drop by
    roughly a factor of K, ignoring variance (e.g., we will sometimes get
    unlucky and one disk will have more than 1/Kth of the images we are
    reading). Therefore if we are running on a distributed filesystem with
    hundreds of disks, the expected latency will drop to ~15ms.
3.  Let's consider a variant where all images are on a single SSD. This
    changes the sequential read performance to 20µs + 1ms per image, which
    adds up to ~30ms overall.

## Measurement {#measurement}

The preceding section gives some tips about how to think about performance when writing code, without worrying too much about how to measure the performance impact of your choices. However, before you actually start making improvements, or run into a tradeoff involving performance, simplicity, etc., you will want to measure or estimate the potential performance benefits. Being able to measure things effectively is the number one skill you'll want to have in your arsenal when doing performance-related work.

As an aside, it's worth pointing out that profiling code that you're unfamiliar with can also be a good way of getting a general sense of the structure of the codebase and how it operates. Examining the source code of heavily involved routines in the dynamic call graph of a program can give you a high-level sense of "what happens" when running the code, which can then build your confidence in making performance-improving changes in slightly unfamiliar code.

### Profiling tools and tips {#profiling-tools-and-tips}

Many useful profiling tools are available. A useful tool to reach for first is [pprof](https://github.com/google/pprof/blob/main/doc/README.md) since it gives good high-level performance information and is easy to use both locally and for code running in production. Also try [perf](https://perf.wiki.kernel.org/index.php/Main_Page) if you want more detailed insight into performance.

Some tips for profiling:

*   Build production binaries with appropriate debugging information and
    optimization flags.
*   If you can, write a [microbenchmark][fast75] that covers the code you are
    improving (see the sketch after this list). Microbenchmarks improve
    turnaround time when making performance improvements, help verify the
    impact of performance improvements, and can help prevent future performance
    regressions. However, microbenchmarks can have [pitfalls][fast39] that make
    them non-representative of full system performance. Useful libraries for
    writing microbenchmarks: [C++][cpp benchmarks], [Go][go benchmarks],
    [Java][jmh].
*   Use a benchmark library to [emit performance counter readings][fast53],
    both for better precision and to get more insight into program behavior.
*   Lock contention can often artificially lower CPU usage. Some mutex
    implementations provide support for profiling lock contention.
*   Use [ML profilers][xprof] for machine learning performance work.
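A sketch of such a microbenchmark using the open-source [C++ benchmark library](https://github.com/google/benchmark); the function under test, `SetBitIndices`, is hypothetical:

```c++
#include <cstdint>
#include <vector>

#include "benchmark/benchmark.h"

// Hypothetical function under test.
std::vector<int> SetBitIndices(uint64_t mask);

static void BM_SetBitIndices(benchmark::State& state) {
  const uint64_t mask = 0x0f0f0f0f0f0f0f0fu;
  for (auto _ : state) {
    auto indices = SetBitIndices(mask);
    benchmark::DoNotOptimize(indices);  // keep the work from being optimized away
  }
}
BENCHMARK(BM_SetBitIndices);

BENCHMARK_MAIN();
```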
### What to do when profiles are flat {#what-to-do-when-profiles-are-flat}

You will often run into situations where your CPU profile is flat (there is no obvious big contributor to slowness). This often happens when all the low-hanging fruit has been picked. Here are some tips to consider if you find yourself in this situation:

*   Don't discount the value of many small optimizations! Making twenty
    separate 1% improvements in some subsystem is often eminently possible, and
    collectively they mean a pretty sizable improvement (work of this flavor
    often relies on having stable and high-quality microbenchmarks). Some
    examples of these sorts of changes are in the
    [changes that demonstrate multiple techniques](#cls-that-demonstrate-multiple-techniques)
    section.
*   Find loops closer to the top of call stacks (the flame graph view of a CPU
    profile can be helpful here). Potentially, the loop or the code it calls
    could be restructured to be more efficient. For example, some code that
    initially built a complicated graph structure incrementally by looping over
    nodes and edges of the input was changed to build the graph structure in
    one shot by passing it the entire input. This removed a bunch of internal
    checks that were happening per edge in the initial code.
*   Take a step back and look for structural changes higher up in the call
    stacks instead of concentrating on micro-optimizations. The techniques
    listed under [algorithmic improvements](#algorithmic-improvements) can be
    useful when doing this.
*   Look for overly general code. Replace it with a customized or lower-level
    implementation. E.g., if an application is repeatedly using a regular
    expression match where a simple prefix match would suffice, consider
    dropping the use of the regular expression (see the sketch after this
    list).
*   Attempt to reduce the number of allocations:
    [get an allocation profile][profile sources], and pick away at the highest
    contributor to the number of allocations. This will have two effects:
    (1) it will provide a direct reduction of the amount of time spent in the
    allocator (and in the garbage collector for GC-ed languages); (2) there
    will often be a reduction in cache misses, since in a long-running program
    using tcmalloc, every allocation tends to go to a different cache line.
*   Gather other types of profiles, especially ones based on hardware
    performance counters. Such profiles may point out functions that are
    encountering a high cache miss rate. Techniques described in the
    [profiling tools and tips](#profiling-tools-and-tips) section can be
    helpful.
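A hedged sketch of the regular-expression bullet above, assuming RE2 and Abseil are available (`IsMetricName` and its pattern are hypothetical):

```c++
#include "absl/strings/match.h"
#include "absl/strings/string_view.h"
#include "re2/re2.h"

// General but slow: compiles and runs a regular expression even though the
// pattern is just an anchored literal prefix.
bool IsMetricNameSlow(absl::string_view name) {
  static const RE2* re = new RE2("^/metrics/.*");
  return RE2::PartialMatch(name, *re);
}

// Specialized fast path: a plain prefix comparison does the same job.
bool IsMetricName(absl::string_view name) {
  return absl::StartsWith(name, "/metrics/");
}
```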
## API considerations {#api-considerations}

Some of the techniques suggested below require changing data structures and function signatures, which may be disruptive to callers. Try to organize code so that the suggested performance improvements can be made inside an encapsulation boundary without affecting public interfaces. This will be easier if your [modules are deep](https://web.stanford.edu/~ouster/cgi-bin/book.php) (significant functionality accessed via a narrow interface).

Widely used APIs come under heavy pressure to add features. Be careful when adding new features, since these will constrain future implementations and increase cost unnecessarily for users who don't need the new features. E.g., many C++ standard library containers promise iterator stability, which in typical implementations increases the number of allocations significantly, even though many users do not need this stability.

Some specific techniques are listed below. Consider carefully the performance benefits vs. any API usability issues introduced by such changes.

### Bulk APIs

Provide bulk operations to reduce expensive API boundary crossings or to take advantage of algorithmic improvements.

**Add bulk `MemoryManager::LookupMany` interface.**


In addition to adding a bulk interface, this also simplified the signature for the new bulk variant: it turns out clients only needed to know if all the keys were found, so we can return a bool rather than a Status object.


    memory_manager.h

Before:
    class MemoryManager {
    + public:
    +  ...
    +  util::StatusOr<LiveTensor> Lookup(const TensorIdProto& id);
After:
    class MemoryManager {
    + public:
    +  ...
    +  util::StatusOr<LiveTensor> Lookup(const TensorIdProto& id);
    +
    +  // Lookup the identified tensors
    +  struct LookupKey {
    +    ClientHandle client;
    +    uint64 local_id;
    +  };
    +  bool LookupMany(absl::Span<const LookupKey> keys,
    +                  absl::Span<tensorflow::Tensor> tensors);

**Add bulk `ObjectStore::DeleteRefs` API to amortize locking overhead.**


    object_store.h

Before:
    template <typename T>
    +class ObjectStore {
    + public:
    +  ...
    +  absl::Status DeleteRef(Ref);
After:
    template <typename T>
    +class ObjectStore {
    + public:
    +  ...
    +  absl::Status DeleteRef(Ref);
    +
    +  // Delete many references.  For each ref, if no other Refs point to the same
    +  // object, the object will be deleted.  Returns non-OK on any error.
    +  absl::Status DeleteRefs(absl::Span<const Ref> refs);
    +  ...
    +template <typename T>
    +absl::Status ObjectStore<T>::DeleteRefs(absl::Span<const Ref> refs) {
    +  util::Status result;
    +  absl::MutexLock l(&mu_);
    +  for (auto ref : refs) {
    +    result.Update(DeleteRefLocked(ref));
    +  }
    +  return result;
    +}

    memory_tracking.cc

Before:
    void HandleBatch(int, const plaque::Batch& input) override {
    +  for (const auto& t : input) {
    +    auto in = In(t);
    +    PLAQUE_OP_ASSIGN_OR_RETURN(const auto& handles, in.handles());
    +    for (const auto handle : handles.value->handles()) {
    +      PLAQUE_OP_RETURN_IF_ERROR(in_buffer_store_
    +                                    ? bstore_->DeleteRef(handle)
    +                                    : tstore_->DeleteRef(handle));
    +    }
    +  }
    +}
After:
    void HandleBatch(int, const plaque::Batch& input) override {
    +  for (const auto& t : input) {
    +    auto in = In(t);
    +    PLAQUE_OP_ASSIGN_OR_RETURN(const auto& handles, in.handles());
    +    if (in_buffer_store_) {
    +      PLAQUE_OP_RETURN_IF_ERROR(
    +          bstore_->DeleteRefs(handles.value->handles()));
    +    } else {
    +      PLAQUE_OP_RETURN_IF_ERROR(
    +          tstore_->DeleteRefs(handles.value->handles()));
    +    }
    +  }
    +}

**Use Floyd's heap construction for efficient initialization.**


Bulk initialization of a heap can be done in O(N) time, whereas adding one element at a time and updating the heap property after each addition requires O(N lg(N)) time.
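For illustration (this is not the CL itself): in standard C++, `std::make_heap` performs an O(N) Floyd-style bulk construction, whereas growing the heap incrementally with `std::push_heap` costs O(N lg(N)):

```c++
#include <algorithm>
#include <vector>

void BuildHeapSlow(std::vector<int>& v) {
  // O(N lg(N)): extend the heap by one element at a time.
  for (auto it = v.begin(); it != v.end(); ++it) {
    std::push_heap(v.begin(), it + 1);
  }
}

void BuildHeapFast(std::vector<int>& v) {
  // O(N): Floyd-style bulk construction in a single call.
  std::make_heap(v.begin(), v.end());
}
```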

Sometimes it is hard to change callers to use a new bulk API directly. In that case it might be beneficial to use a bulk API internally and cache the results for use in future non-bulk API calls:

**Cache block decode results for use in future calls.**


Each lookup needs to decode a whole block of K entries. Store the decoded entries in a cache and consult the cache on future lookups.


    lexicon.cc

Before:
    void GetTokenString(int pos, std::string* out) const {
    +  ...
    +  absl::FixedArray<LexiconEntry, 32> entries(pos + 1);
    +
    +  // Decode all lexicon entries up to and including pos.
    +  for (int i = 0; i <= pos; ++i) {
    +    p = util::coding::TwoValuesVarint::Decode32(p, &entries[i].remaining,
    +                                                &entries[i].shared);
    +    entries[i].remaining_str = p;
    +    p += entries[i].remaining;  // remaining bytes trail each entry.
    +  }
After:
    mutable std::vector<absl::InlinedVector<std::string, 16>> cache_;
    +...
    +void GetTokenString(int pos, std::string* out) const {
    +  ...
    +  DCHECK_LT(skentry, cache_.size());
    +  if (!cache_[skentry].empty()) {
    +    *out = cache_[skentry][pos];
    +    return;
    +  }
    +  ...
    +  // Init cache.
    +  ...
    +  const char* prev = p;
    +  for (int i = 0; i < block_sz; ++i) {
    +    uint32 shared, remaining;
    +    p = TwoValuesVarint::Decode32(p, &remaining, &shared);
    +    auto& cur = cache_[skentry].emplace_back();
    +    gtl::STLStringResizeUninitialized(&cur, remaining + shared);
    +
    +    std::memcpy(cur.data(), prev, shared);
    +    std::memcpy(cur.data() + shared, p, remaining);
    +    prev = cur.data();
    +    p += remaining;
    +  }
    +  *out = cache_[skentry][pos];
### View types

Prefer view types (e.g., `std::string_view`, `absl::Span`, `absl::FunctionRef`) for function arguments (unless ownership of the data is being transferred). These types reduce copying and allow callers to pick their own container types (e.g., one caller might use `std::vector` whereas another uses `absl::InlinedVector`). A sketch follows the next subsection.

### Pre-allocated/pre-computed arguments

For frequently called routines, it is sometimes useful to allow higher-level callers to pass in a data structure that they own, or information that the called routine needs and that the client already has. This avoids forcing the low-level routine to allocate its own temporary data structure or recompute already-available information.
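A small sketch of the view-types advice above (the function and its callers are hypothetical):

```c++
#include <cstdint>
#include <string>
#include <vector>

#include "absl/container/inlined_vector.h"
#include "absl/strings/string_view.h"
#include "absl/types/span.h"

// Takes views, so nothing is copied and callers pick their own containers.
int64_t TotalLength(absl::string_view prefix,
                    absl::Span<const std::string> names) {
  int64_t total = prefix.size();
  for (const std::string& name : names) total += name.size();
  return total;
}

void Callers() {
  std::vector<std::string> a = {"x", "y"};
  absl::InlinedVector<std::string, 4> b = {"z"};
  TotalLength("pre_", a);  // the same function serves both container types
  TotalLength("pre_", b);
}
```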

**Add `RPC_Stats::RecordRPC` variant allowing the client to pass in an already-available WallTime value.**


    rpc-stats.h

Before:
    static void RecordRPC(const Name &name, const RPC_Stats_Measurement& m);
After:
    static void RecordRPC(const Name &name, const RPC_Stats_Measurement& m,
    +                      WallTime now);

    clientchannel.cc

Before:
    const WallTime now = WallTime_Now();
    +...
    +RPC_Stats::RecordRPC(stats_name, m);
After:
    const WallTime now = WallTime_Now();
    +...
    +RPC_Stats::RecordRPC(stats_name, m, now);
### Thread-compatible vs. Thread-safe types {#thread-compatible-vs-thread-safe-types}

A type may be either thread-compatible (synchronized externally) or thread-safe (synchronized internally). Most generally used types should be thread-compatible. That way, callers who do not need thread-safety don't pay for it.

**Make a class thread-compatible since callers are already synchronized.**


    hitless-transfer-phase.cc

Before:
    TransferPhase HitlessTransferPhase::get() const {
    +  static CallsiteMetrics cm("HitlessTransferPhase::get");
    +  MonitoredMutexLock l(&cm, &mutex_);
    +  return phase_;
    +}
After:
    TransferPhase HitlessTransferPhase::get() const { return phase_; }

    hitless-transfer-phase.cc

Before:
    bool HitlessTransferPhase::AllowAllocate() const {
    +  static CallsiteMetrics cm("HitlessTransferPhase::AllowAllocate");
    +  MonitoredMutexLock l(&cm, &mutex_);
    +  return phase_ == TransferPhase::kNormal || phase_ == TransferPhase::kBrownout;
    +}
After:
    bool HitlessTransferPhase::AllowAllocate() const {
    +  return phase_ == TransferPhase::kNormal || phase_ == TransferPhase::kBrownout;
    +}
However, if the typical use of a type needs synchronization, prefer to move the synchronization inside the type. This allows the synchronization mechanism to be tweaked as necessary to improve performance (e.g., sharding to reduce contention, as sketched below) without affecting callers.
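A sketch of an internally synchronized counter that shards its mutex to reduce contention; the type and shard count are illustrative, not from a CL. Because the locking lives inside the type, the sharding scheme can be tuned without touching callers:

```c++
#include <cstdint>
#include <functional>
#include <thread>

#include "absl/synchronization/mutex.h"

class ShardedCounter {
 public:
  void Add(int64_t delta) {
    Shard& s = shards_[ShardIndex()];
    absl::MutexLock l(&s.mu);
    s.value += delta;
  }

  int64_t Total() const {
    int64_t total = 0;
    for (const Shard& s : shards_) {
      absl::MutexLock l(&s.mu);
      total += s.value;
    }
    return total;
  }

 private:
  static constexpr int kShards = 16;
  struct Shard {
    mutable absl::Mutex mu;
    int64_t value = 0;  // guarded by mu
  };
  static size_t ShardIndex() {
    // Spread threads across shards by hashing the thread id.
    return std::hash<std::thread::id>()(std::this_thread::get_id()) % kShards;
  }
  Shard shards_[kShards];
};
```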
## Algorithmic improvements {#algorithmic-improvements}

The most critical opportunities for performance improvements come from algorithmic improvements, e.g., turning an O(N²) algorithm into O(N lg(N)) or O(N), or avoiding potentially exponential behavior. These opportunities are rare in stable code, but are worth paying attention to when writing new code. A few examples that show such improvements to pre-existing code:

**Add nodes to a cycle detection structure in reverse post-order.**


We were previously adding graph nodes and edges one at a time to a cycle-detection data structure, which required expensive work per edge. We now add the entire graph in reverse post-order, which makes cycle-detection trivial.


    graphcycles.h

Before:
    class GraphCycles : public util_graph::Graph {
    + public:
    +  GraphCycles();
    +  ~GraphCycles() override;
    +
    +  using Node = util_graph::Node;
After:
    class GraphCycles : public util_graph::Graph {
    + public:
    +  GraphCycles();
    +  ~GraphCycles() override;
    +
    +  using Node = util_graph::Node;
    +
    +  // InitFrom adds all the nodes and edges from src, returning true if
    +  // successful, false if a cycle is encountered.
    +  // REQUIRES: no nodes and edges have been added to GraphCycles yet.
    +  bool InitFrom(const util_graph::Graph& src);

    graphcycles.cc

    bool GraphCycles::InitFrom(const util_graph::Graph& src) {
    +  ...
    +  // Assign ranks in topological order so we don't need any reordering during
    +  // initialization. For an acyclic graph, DFS leaves nodes in reverse
    +  // topological order, so we assign decreasing ranks to nodes as we leave them.
    +  Rank last_rank = n;
    +  auto leave = [&](util_graph::Node node) {
    +    DCHECK(r->rank[node] == kMissingNodeRank);
    +    NodeInfo* nn = &r->nodes[node];
    +    nn->in = kNil;
    +    nn->out = kNil;
    +    r->rank[node] = --last_rank;
    +  };
    +  util_graph::DFSAll(src, std::nullopt, leave);
    +
    +  // Add all the edges (detect cycles as we go).
    +  bool have_cycle = false;
    +  util_graph::PerEdge(src, [&](util_graph::Edge e) {
    +    DCHECK_NE(r->rank[e.src], kMissingNodeRank);
    +    DCHECK_NE(r->rank[e.dst], kMissingNodeRank);
    +    if (r->rank[e.src] >= r->rank[e.dst]) {
    +      have_cycle = true;
    +    } else if (!HasEdge(e.src, e.dst)) {
    +      EdgeListAddNode(r, &r->nodes[e.src].out, e.dst);
    +      EdgeListAddNode(r, &r->nodes[e.dst].in, e.src);
    +    }
    +  });
    +  if (have_cycle) {
    +    return false;
    +  } else {
    +    DCHECK(CheckInvariants());
    +    return true;
    +  }
    +}

    graph_partitioner.cc

Before:
    absl::Status MergeGraph::Init() {
    +  const Graph& graph = *compiler_->graph();
    +  clusters_.resize(graph.NodeLimit());
    +  graph.PerNode([&](Node node) {
    +    graph_->AddNode(node);
    +    NodeList* n = new NodeList;
    +    n->push_back(node);
    +    clusters_[node] = n;
    +  });
    +  absl::Status s;
    +  PerEdge(graph, [&](Edge e) {
    +    if (!s.ok()) return;
    +    if (graph_->HasEdge(e.src, e.dst)) return;  // already added
    +    if (!graph_->InsertEdge(e.src, e.dst)) {
    +      s = absl::InvalidArgumentError("cycle in the original graph");
    +    }
    +  });
    +  return s;
    +}
After:
    absl::Status MergeGraph::Init() {
    +  const Graph& graph = *compiler_->graph();
    +  if (!graph_->InitFrom(graph)) {
    +    return absl::InvalidArgumentError("cycle in the original graph");
    +  }
    +  clusters_.resize(graph.NodeLimit());
    +  graph.PerNode([&](Node node) {
    +    NodeList* n = new NodeList;
    +    n->push_back(node);
    +    clusters_[node] = n;
    +  });
    +  return absl::OkStatus();
    +}

**Replace the deadlock detection system built into a mutex implementation with a better algorithm.**


Replaced the deadlock detection algorithm with one that is ~50x as fast and scales to millions of mutexes without problems (the old algorithm relied on a 2K limit to avoid a performance cliff). The new code is based on the following paper: David J. Pearce and Paul H. J. Kelly, "A dynamic topological sort algorithm for directed acyclic graphs", Journal of Experimental Algorithmics (JEA), Volume 11, 2006, Article No. 1.7.


The new algorithm takes O(|V|+|E|) space (instead of the O(|V|²) bits needed by the older algorithm). Lock-acquisition-order graphs are very sparse, so this is much less space. The algorithm is also quite simple: the core of it is ~100 lines of C++. Since the code now scales to a much larger number of Mutexes, we were able to relax an artificial 2K limit, which uncovered a number of latent deadlocks in real programs.


Benchmark results: these were run in DEBUG mode, since deadlock detection is mainly enabled in debug mode. The benchmark argument (/2k etc.) is the number of tracked nodes. At the old algorithm's default 2K limit, the new algorithm takes only 0.5 microseconds per InsertEdge, compared to 22 microseconds for the old algorithm. The new algorithm also easily scales to much larger graphs, whereas the old algorithm keels over quickly.

Old algorithm:
    DEBUG: Benchmark            Time(ns)    CPU(ns) Iterations
    +----------------------------------------------------------
    +DEBUG: BM_StressTest/2k        23553      23566      29086
    +DEBUG: BM_StressTest/4k        45879      45909      15287
    +DEBUG: BM_StressTest/16k      776938     777472        817
New algorithm:
    DEBUG: BM_StressTest/2k          392        393   10485760
    +DEBUG: BM_StressTest/4k          392        393   10485760
    +DEBUG: BM_StressTest/32k         407        407   10485760
    +DEBUG: BM_StressTest/256k        456        456   10485760
    +DEBUG: BM_StressTest/1M          534        534   10485760

**Replace an IntervalMap (with O(lg N) lookups) with a hash table (O(1) lookups).**


The initial code was using IntervalMap because it seemed like the right data structure to support coalescing of adjacent blocks, but a hash table suffices since the adjacent block can be found by a hash table lookup. This (plus other changes in the CL) improves the performance of tpu::BestFitAllocator by ~4X.


    best_fit_allocator.h

Before:
    using Block = gtl::IntervalMap<int64, BlockState>::Entry;
    +...
    +// Map of pairs (address range, BlockState) with one entry for each allocation
    +// covering the range [0, allocatable_range_end_).  Adjacent kFree and
    +// kReserved blocks are coalesced. Adjacent kAllocated blocks are not
    +// coalesced.
    +gtl::IntervalMap<int64, BlockState> block_list_;
    +
    +// Set of all free blocks sorted according to the allocation policy. Adjacent
    +// free blocks are coalesced.
    +std::set<Block, BlockSelector> free_list_;
After:
    // A faster hash function for offsets in the BlockTable
    +struct OffsetHash {
    +  ABSL_ATTRIBUTE_ALWAYS_INLINE size_t operator()(int64 value) const {
    +    uint64 m = value;
    +    m *= uint64_t{0x9ddfea08eb382d69};
    +    return static_cast<uint64_t>(m ^ (m >> 32));
    +  }
    +};
    +
    +// Hash table maps from block start address to block info.
    +// We include the length of the previous block in this info so we
    +// can find the preceding block to coalesce with.
    +struct HashTableEntry {
    +  BlockState state;
    +  int64 my_length;
    +  int64 prev_length;  // Zero if there is no previous block.
    +};
    +using BlockTable = absl::flat_hash_map<int64, HashTableEntry, OffsetHash>;

**Replace sorted-list intersection (O(N log N)) with hash table lookups (O(N)).**


The old code to detect whether two nodes share a common source would get the sources for each node in sorted order and then do a sorted intersection. The new code places the sources for one node in a hash table and then iterates over the other node's sources, checking the hash table.

    name             old time/op  new time/op  delta
    +BM_CompileLarge   28.5s ± 2%   22.4s ± 2%  -21.61%  (p=0.008 n=5+5)

**Implement a good hash function so that operations are O(1) instead of O(N).**


    location.h

Before:
    // Hasher for Location objects.
    +struct LocationHash {
    +  size_t operator()(const Location* key) const {
    +    return key != nullptr ? util_hash::Hash(key->address()) : 0;
    +  }
    +};
After:
    size_t HashLocation(const Location& loc);
    +...
    +struct LocationHash {
    +  size_t operator()(const Location* key) const {
    +    return key != nullptr ? HashLocation(*key) : 0;
    +  }
    +};

    location.cc

    size_t HashLocation(const Location& loc) {
    +  util_hash::MurmurCat m;
    +
    +  // Encode some simpler features into a single value.
    +  m.AppendAligned((loc.dynamic() ? 1 : 0)                    //
    +                  | (loc.append_shard_to_address() ? 2 : 0)  //
    +                  | (loc.is_any() ? 4 : 0)                   //
    +                  | (!loc.any_of().empty() ? 8 : 0)          //
    +                  | (loc.has_shardmap() ? 16 : 0)            //
    +                  | (loc.has_sharding() ? 32 : 0));
    +
    +  if (loc.has_shardmap()) {
    +    m.AppendAligned(loc.shardmap().output() |
    +                    static_cast<uint64_t>(loc.shardmap().stmt()) << 20);
    +  }
    +  if (loc.has_sharding()) {
    +    uint64_t num = 0;
    +    switch (loc.sharding().type_case()) {
    +      case Sharding::kModShard:
    +        num = loc.sharding().mod_shard();
    +        break;
    +      case Sharding::kRangeSplit:
    +        num = loc.sharding().range_split();
    +        break;
    +      case Sharding::kNumShards:
    +        num = loc.sharding().num_shards();
    +        break;
    +      default:
    +        num = 0;
    +        break;
    +    }
    +    m.AppendAligned(static_cast<uint64_t>(loc.sharding().type_case()) |
    +                    (num << 3));
    +  }
    +
    +  auto add_string = [&m](absl::string_view s) {
    +    if (!s.empty()) {
    +      m.Append(s.data(), s.size());
    +    }
    +  };
    +
    +  add_string(loc.address());
    +  add_string(loc.lb_policy());
    +
    +  // We do not include any_of since it is complicated to compute a hash
    +  // value that is not sensitive to order and duplication.
    +  return m.GetHash();
    +}
## Better memory representation {#better-memory-representation}

Careful consideration of the memory footprint and cache footprint of important data structures can often yield big savings. The data structures below focus on supporting common operations by touching fewer cache lines. Care taken here can (a) avoid expensive cache misses and (b) reduce memory bus traffic, which speeds up both the program in question and anything else running on the same machine. They rely on some common techniques you may find useful when designing your own data structures.

### Compact data structures

Use compact representations for data that will be accessed often or that comprises a large portion of the application's memory usage. A compact representation can significantly reduce memory usage and improve performance by touching fewer cache lines and reducing memory bus bandwidth usage. However, watch out for [cache-line contention](#reduce-false-sharing).

### Memory layout {#memory-layout}

Carefully consider the memory layout of types that have a large memory or cache footprint (see the sketch after this list).

*   Reorder fields to reduce padding between fields with different alignment
    requirements (see this
    [class layout discussion](https://stackoverflow.com/questions/9989164/optimizing-memory-layout-of-class-instances-in-c)).
*   Use smaller numeric types where the stored data will fit in the smaller
    type.
*   Enum values sometimes take up a whole word unless you're careful. Consider
    using a smaller representation (e.g., use `enum class OpType : uint8_t {
    ... }` instead of `enum class OpType { ... }`).
*   Order fields so that fields that are frequently accessed together are
    closer to each other – this will reduce the number of cache lines touched
    on common operations.
*   Place hot read-only fields away from hot mutable fields so that writes to
    the mutable fields do not cause the read-only fields to be evicted from
    nearby caches.
*   Move cold data so it does not live next to hot data, either by placing the
    cold data at the end of the struct, or behind a level of indirection, or in
    a separate array.
*   Consider packing things into fewer bytes by using bit- and byte-level
    encoding. This can be complicated, so only do this when the data in
    question is encapsulated inside a well-tested module and the overall
    reduction in memory usage is significant. Furthermore, watch out for side
    effects like under-alignment of frequently used data, or more expensive
    code for accessing packed representations. Validate such changes using
    benchmarks.
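A sketch of the field-reordering bullet above; the struct is hypothetical, and the sizes assume a typical 64-bit ABI:

```c++
#include <cstdint>

struct Before {
  bool visited;      // 1 byte + 7 bytes padding (pointer needs 8-byte alignment)
  const char* name;  // 8 bytes
  uint8_t kind;      // 1 byte + 3 bytes padding
  uint32_t weight;   // 4 bytes
};                   // 24 bytes total

struct After {
  const char* name;  // 8 bytes
  uint32_t weight;   // 4 bytes
  uint8_t kind;      // 1 byte
  bool visited;      // 1 byte + 2 bytes padding
};                   // 16 bytes total: one third smaller, fewer cache lines

static_assert(sizeof(After) < sizeof(Before));
```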
### Indices instead of pointers {#indices-instead-of-pointers}

On modern 64-bit machines, pointers take up 64 bits. If you have a pointer-rich data structure, you can easily chew up lots of memory with indirections of T\*. Instead, consider using integer indices into an array T[] or other data structure. Not only will the references be smaller (if the number of indices is small enough to fit in 32 or fewer bits), but the storage for all the T[] elements will be contiguous, often leading to better cache locality.

### Batched storage

Avoid data structures that allocate a separate object per stored element (e.g., `std::map`, `std::unordered_map` in C++). Instead, consider types that use chunked or flat representations to store multiple elements in close proximity in memory (e.g., `std::vector`, `absl::flat_hash_{map,set}` in C++). Such types tend to have much better cache behavior. Furthermore, they incur less allocator overhead.

One useful technique is to partition elements into chunks, where each chunk can hold a fixed number of elements. This technique can reduce the cache footprint of a data structure significantly while preserving good asymptotic behavior.

For some data structures, a single chunk suffices to hold all elements (e.g., strings and vectors). Other types (e.g., `absl::flat_hash_map`) also use this technique.

### Inlined storage {#inlined-storage}

Some container types are optimized for storing a small number of elements. These types provide space for a small number of elements at the top level and completely avoid allocations when the number of elements is small. This can be very helpful when instances of such types are constructed often (e.g., as stack variables in frequently executed code), or if many instances are live at the same time. If a container will typically hold a small number of elements, consider using one of the inlined storage types, e.g., `absl::InlinedVector`.

Caveat: if `sizeof(T)` is large, inlined storage containers may not be the best choice since the inlined backing store will be large.

### Unnecessarily nested maps

Sometimes a nested map data structure can be replaced with a single-level map with a compound key. This can reduce the cost of lookups and insertions significantly.

**Reduce allocations and improve cache footprint by converting `btree<a,btree<b,c>>` to `btree<pair<a,b>,c>`.**


    graph_splitter.cc

Before:
    absl::btree_map<std::string, absl::btree_map<std::string, OpDef>> ops;
After:
    // The btree maps from {package_name, op_name} to its const Opdef*.
    +absl::btree_map<std::pair<absl::string_view, absl::string_view>,
    +                const OpDef*>
    +    ops;
Caveat: if the first map key is big, it might be better to stick with nested maps:

**Switching to a nested map leads to a 76% performance improvement in a microbenchmark.**


We previously had a single-level hash table where the key consisted of a (string) path and some other numeric sub-keys. Each path occurred in approximately 1000 keys on average. We split the hash table into two levels, where the first level was keyed by the path and each second-level hash table kept just the sub-key-to-data mapping for a particular path. This reduced the memory usage for storing paths by a factor of 1000, and also sped up accesses where many sub-keys for the same path were accessed together.

### Arenas {#arenas}

Arenas can help reduce memory allocation costs, but they also have the benefit of packing independently allocated items next to each other, typically in fewer cache lines, and eliminating most destruction costs. They are likely most effective for complex data structures with many sub-objects. Consider providing an appropriate initial size for the arena, since that can help reduce allocations.

Caveat: it is easy to misuse arenas by putting too many short-lived objects in a long-lived arena, which can unnecessarily bloat the memory footprint.
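A sketch using protobuf arenas, assuming a protobuf release where `google::protobuf::Arena::Create` constructs messages; the message type and its fields are hypothetical:

```c++
#include "google/protobuf/arena.h"
#include "myservice/request.pb.h"  // hypothetical arena-enabled message

void HandleRequest() {
  // The message and its sub-messages are packed into the arena's blocks
  // instead of costing one heap allocation each, and everything is freed in
  // bulk when `arena` goes out of scope, skipping per-object destructors.
  google::protobuf::Arena arena;
  auto* req = google::protobuf::Arena::Create<MyRequest>(&arena);
  req->mutable_header()->set_id(42);
  // ... use *req; avoid keeping short-lived objects in a long-lived arena.
}
```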
### Arrays instead of maps

If the domain of a map can be represented by a small integer or an enum, or if the map will have very few elements, the map can sometimes be replaced by an array or a vector of some form.

**Use an array instead of a flat_map.**


    rtp_controller.h

Before:
    const gtl::flat_map<int, int> payload_type_to_clock_frequency_;
After:
    // A map (implemented as a simple array) indexed by payload_type to clock freq
+// for that payload type (or 0)
    +struct PayloadTypeToClockRateMap {
    +  int map[128];
    +};
    +...
    +const PayloadTypeToClockRateMap payload_type_to_clock_frequency_;
### Bit vectors instead of sets

If the domain of a set can be represented by a small integer, the set can be replaced with a bit vector (`InlinedBitVector` is often a good choice). Set operations are also very efficient on these representations using bitwise boolean operations (OR for union, AND for intersection, etc.), as in the sketch below.
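A sketch using standard `std::bitset` in place of a set of small integers; the 256-zone domain mirrors the example that follows:

```c++
#include <bitset>

// A set of zone ids drawn from a small integer domain (here, 0..255),
// represented as one bit per zone.
using ZoneSet = std::bitset<256>;

bool Contains(const ZoneSet& s, int zone) { return s.test(zone); }

// Whole-set operations are a handful of word-wide bitwise instructions.
ZoneSet Union(const ZoneSet& a, const ZoneSet& b) { return a | b; }
ZoneSet Intersection(const ZoneSet& a, const ZoneSet& b) { return a & b; }
bool IsSubset(const ZoneSet& a, const ZoneSet& b) { return (a & ~b).none(); }
```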

**Spanner placement system: replace `dense_hash_set<ZoneId>` with a bit vector with one bit per zone.**


    zone_set.h

Before:
    class ZoneSet: public dense_hash_set<ZoneId> {
    + public:
    +  ...
    +  bool Contains(ZoneId zone) const {
    +    return count(zone) > 0;
    +  }
After:
    class ZoneSet {
    +  ...
    +  // Returns true iff "zone" is contained in the set
    +  bool ContainsZone(ZoneId zone) const {
    +    return zone < b_.size() && b_.get_bit(zone);
    +  }
    +  ...
    + private:
    +  int size_;          // Number of zones inserted
    +  util::bitmap::InlinedBitVector<256> b_;

    Benchmark results:

    CPU: AMD Opteron (4 cores) dL1:64KB dL2:1024KB
    +Benchmark                          Base (ns)  New (ns) Improvement
    +------------------------------------------------------------------
    +BM_Evaluate/1                            960       676    +29.6%
    +BM_Evaluate/2                           1661      1138    +31.5%
    +BM_Evaluate/3                           2305      1640    +28.9%
    +BM_Evaluate/4                           3053      2135    +30.1%
    +BM_Evaluate/5                           3780      2665    +29.5%
    +BM_Evaluate/10                          7819      5739    +26.6%
    +BM_Evaluate/20                         17922     12338    +31.2%
    +BM_Evaluate/40                         36836     26430    +28.2%

**Use a bit matrix to keep track of reachability properties between operands instead of a hash table.**


    hlo_computation.h

Before:
    using TransitiveOperandMap =
    +    std::unordered_map<const HloInstruction*,
    +                       std::unordered_set<const HloInstruction*>>;
After:
    class HloComputation::ReachabilityMap {
    +  ...
    +  // dense id assignment from HloInstruction* to number
    +  tensorflow::gtl::FlatMap<const HloInstruction*, int> ids_;
    +  // matrix_(a,b) is true iff b is reachable from a
    +  tensorflow::core::Bitmap matrix_;
    +};
## Reduce allocations {#reduce-allocations}

Memory allocation adds costs:

1.  It increases the time spent in the allocator.
2.  Newly allocated objects may require expensive initialization, and
    sometimes correspondingly expensive destruction when no longer needed.
3.  Every allocation tends to be on a new cache line, and therefore data
    spread across many independent allocations will have a larger cache
    footprint than data spread across fewer allocations.

Garbage-collected runtimes sometimes mitigate issue #3 by placing consecutive allocations sequentially in memory.

### Avoid unnecessary allocations {#avoid-unnecessary-allocations}

**Reducing allocations increases benchmark throughput by 21%.**


    memory_manager.cc

Before:
    LiveTensor::LiveTensor(tf::Tensor t, std::shared_ptr<const DeviceInfo> dinfo,
    +                       bool is_batched)
    +    : tensor(std::move(t)),
    +      device_info(dinfo ? std::move(dinfo) : std::make_shared<DeviceInfo>()),
    +      is_batched(is_batched) {
After:
    static const std::shared_ptr<DeviceInfo>& empty_device_info() {
    +  static std::shared_ptr<DeviceInfo>* result =
    +      new std::shared_ptr<DeviceInfo>(new DeviceInfo);
    +  return *result;
    +}
    +
    +LiveTensor::LiveTensor(tf::Tensor t, std::shared_ptr<const DeviceInfo> dinfo,
    +                       bool is_batched)
    +    : tensor(std::move(t)), is_batched(is_batched) {
    +  if (dinfo) {
    +    device_info = std::move(dinfo);
    +  } else {
    +    device_info = empty_device_info();
    +  }

**Use a statically allocated zero vector when possible, rather than allocating a vector and filling it with zeroes.**


    embedding_executor_8bit.cc

Before:
    // The actual implementation of the EmbeddingLookUpT using template parameters
    +// instead of object members to improve the performance.
    +template <bool Mean, bool SymmetricInputRange>
    +static tensorflow::Status EmbeddingLookUpT(...) {
    +    ...
    +  std::unique_ptr<tensorflow::quint8[]> zero_data(
    +      new tensorflow::quint8[max_embedding_width]);
    +  memset(zero_data.get(), 0, sizeof(tensorflow::quint8) * max_embedding_width);
After:
    // A size large enough to handle most embedding widths
    +static const int kTypicalMaxEmbedding = 256;
    +static tensorflow::quint8 static_zero_data[kTypicalMaxEmbedding];  // All zeroes
    +...
    +// The actual implementation of the EmbeddingLookUpT using template parameters
    +// instead of object members to improve the performance.
    +template <bool Mean, bool SymmetricInputRange>
    +static tensorflow::Status EmbeddingLookUpT(...) {
    +    ...
    +  std::unique_ptr<tensorflow::quint8[]> zero_data_backing(nullptr);
    +
    +  // Get a pointer to a memory area with at least
    +  // "max_embedding_width" quint8 zero values.
    +  tensorflow::quint8* zero_data;
    +  if (max_embedding_width <= ARRAYSIZE(static_zero_data)) {
    +    // static_zero_data is big enough so we don't need to allocate zero data
    +    zero_data = &static_zero_data[0];
    +  } else {
    +    // static_zero_data is not big enough: we need to allocate zero data
    +    zero_data_backing =
    +        absl::make_unique<tensorflow::quint8[]>(max_embedding_width);
    +    memset(zero_data_backing.get(), 0,
    +           sizeof(tensorflow::quint8) * max_embedding_width);
    +    zero_data = zero_data_backing.get();
    +  }
Also, prefer stack allocation over heap allocation when object lifetime is bounded by the scope (although be careful with stack frame sizes for large objects).

### Resize or reserve containers {#resize-or-reserve-containers}

When the maximum or expected maximum size of a vector (or some other container type) is known in advance, pre-size the container's backing store (e.g., using `resize` or `reserve` in C++).

**Pre-size a vector and fill it in, rather than doing N push_back operations.**


    indexblockdecoder.cc

Before:
    for (int i = 0; i < ndocs-1; i++) {
    +  uint32 delta;
    +  ERRORCHECK(b->GetRice(rice_base, &delta));
    +  docs_.push_back(DocId(my_shard_ + (base + delta) * num_shards_));
    +  base = base + delta + 1;
    +}
    +docs_.push_back(last_docid_);
After:
    docs_.resize(ndocs);
    +DocId* docptr = &docs_[0];
    +for (int i = 0; i < ndocs-1; i++) {
    +  uint32 delta;
    +  ERRORCHECK(b.GetRice(rice_base, &delta));
    +  *docptr = DocId(my_shard_ + (base + delta) * num_shards_);
    +  docptr++;
    +  base = base + delta + 1;
    +}
    +*docptr = last_docid_;
Caveat: Do not use `resize` or `reserve` to grow a container one element at a time, since that may lead to quadratic behavior. Also, if element construction is expensive, prefer an initial `reserve` call followed by several `push_back` or `emplace_back` calls instead of an initial `resize`, since the latter would double the number of constructor calls.

### Avoid copying when possible {#avoid-copying-when-possible}

*   Prefer moving to copying data structures when possible.
*   If lifetime is not an issue, store pointers or indices instead of copies
    of objects in transient data structures. E.g., if a local map is used to
    select a set of protos from an incoming list of protos, the map can store
    just pointers to the incoming protos instead of copying potentially deeply
    nested data. Another common example is sorting a vector of indices rather
    than sorting a vector of large objects directly, since the latter would
    incur significant copying/moving costs (see the sketch after this list).
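A sketch of the sort-indices suggestion from the last bullet (the `Record` type is hypothetical):

```c++
#include <algorithm>
#include <numeric>
#include <string>
#include <vector>

// Hypothetical record that is expensive to move around.
struct Record {
  std::string payload;  // potentially large
  int key;
};

// Returns the order of `records` by key without moving any Record: we sort
// small integer indices instead of shuffling the large objects themselves.
std::vector<int> SortedOrder(const std::vector<Record>& records) {
  std::vector<int> idx(records.size());
  std::iota(idx.begin(), idx.end(), 0);  // 0, 1, 2, ...
  std::sort(idx.begin(), idx.end(), [&](int a, int b) {
    return records[a].key < records[b].key;
  });
  return idx;
}
```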

    Avoid an extra copy when receiving a tensor via gRPC.


    A benchmark that sends around 400KB tensors speeds up by ~10-15%:

Before:
    Benchmark              Time(ns)    CPU(ns) Iterations
    +-----------------------------------------------------
    +BM_RPC/30/98k_mean    148764691 1369998944       1000
After:
    Benchmark              Time(ns)    CPU(ns) Iterations
    +-----------------------------------------------------
    +BM_RPC/30/98k_mean    131595940 1216998084       1000

    Move large options structure rather than copying it.


    index.cc

Before:
    return search_iterators::DocPLIteratorFactory::Create(opts);
After:
    return search_iterators::DocPLIteratorFactory::Create(std::move(opts));

**Use std::sort instead of std::stable_sort, which avoids an internal copy inside the stable sort implementation.**


    encoded-vector-hits.h

Before:
    std::stable_sort(hits_.begin(), hits_.end(),
    +                 gtl::OrderByField(&HitWithPayloadOffset::docid));
After:
    struct HitWithPayloadOffset {
    +  search_iterators::LocalDocId64 docid;
    +  int first_payload_offset;  // offset into the payload vector.
    +  int num_payloads;
    +
    +  bool operator<(const HitWithPayloadOffset& other) const {
    +    return (docid < other.docid) ||
    +           (docid == other.docid &&
    +            first_payload_offset < other.first_payload_offset);
    +  }
    +};
    +    ...
    +    std::sort(hits_.begin(), hits_.end());
### Reuse temporary objects

A container or an object declared inside a loop will be recreated on every loop iteration, which can lead to expensive construction, destruction, and resizing. Hoisting the declaration outside the loop enables reuse and can provide a significant performance boost. (Compilers are often unable to do such hoisting on their own due to language semantics or their inability to ensure program equivalence.)

**Hoist variable definition outside of loop iteration.**


    autofdo_profile_utils.h

Before:
    auto iterator = absl::WrapUnique(sstable->GetIterator());
    +while (!iterator->done()) {
    +  T profile;
    +  if (!profile.ParseFromString(iterator->value_view())) {
    +    return absl::InternalError(
    +        "Failed to parse mem_block to specified profile type.");
    +  }
    +  ...
    +  iterator->Next();
    +}
After:
    auto iterator = absl::WrapUnique(sstable->GetIterator());
    +T profile;
    +while (!iterator->done()) {
    +  if (!profile.ParseFromString(iterator->value_view())) {
    +    return absl::InternalError(
    +        "Failed to parse mem_block to specified profile type.");
    +  }
    +  ...
    +  iterator->Next();
    +}

**Define a protobuf variable outside a loop so that its allocated storage can be reused across loop iterations.**

    +
    + +

    stats-router.cc

    + +
    +
    for (auto& r : routers_to_update) {
    +  ...
    +  ResourceRecord record;
    +  {
    +    MutexLock agg_lock(r.agg->mutex());
    +    r.agg->AddResourceRecordUsages(measure_indices, &record);
    +  }
    +  ...
    +}
    +
    +
    + +
    +
    ResourceRecord record;
    +for (auto& r : routers_to_update) {
    +  ...
    +  record.Clear();
    +  {
    +    MutexLock agg_lock(r.agg->mutex());
    +    r.agg->AddResourceRecordUsages(measure_indices, &record);
    +  }
    +  ...
    +}
    +
    +
    + + +
    + + + +
    +

Serialize to the same std::string repeatedly.

    +
    + +

    program_rep.cc

    + +
    +
    std::string DeterministicSerialization(const proto2::Message& m) {
    +  std::string result;
    +  proto2::io::StringOutputStream sink(&result);
    +  proto2::io::CodedOutputStream out(&sink);
    +  out.SetSerializationDeterministic(true);
    +  m.SerializePartialToCodedStream(&out);
    +  return result;
    +}
    +
    +
    + +
    +
    absl::string_view DeterministicSerializationTo(const proto2::Message& m,
    +                                               std::string* scratch) {
    +  scratch->clear();
    +  proto2::io::StringOutputStream sink(scratch);
    +  proto2::io::CodedOutputStream out(&sink);
    +  out.SetSerializationDeterministic(true);
    +  m.SerializePartialToCodedStream(&out);
    +  return absl::string_view(*scratch);
    +}
    +
    +
    + + +
Caveat: protobufs, strings, vectors, and other containers tend to grow to the
size of the largest value ever stored in them. Therefore reconstructing them
periodically (e.g., after every N uses) can help reduce memory requirements and
reinitialization costs.

## Avoid unnecessary work {#avoid-unnecessary-work}

Perhaps one of the most effective ways to improve performance is to avoid work
you don't have to do. This can take many forms, including creating specialized
paths through code for common cases that avoid more general, expensive
computation; precomputing results; deferring work until it is really needed;
and hoisting work into less frequently executed pieces of code. Below are many
examples of this general approach, grouped into a few representative
categories.

### Fast paths for common cases

Often, code is written to cover all cases, but some subset of the cases is much
simpler and more common than the others. E.g., `vector::push_back` usually has
enough space for the new element, but contains code to resize the underlying
storage when it does not. Some attention paid to the structure of code can help
make the common, simple case faster without hurting uncommon-case performance
significantly.
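The shape of the technique, as a sketch (this `ByteBuffer` is invented for
illustration, not a real API):

```c++
#include <cstddef>

// The common "space available" case is a compare and a store; the rare growth
// case is kept out of line so it does not bloat the inlined fast path.
class ByteBuffer {
 public:
  void Append(char c) {
    if (size_ < capacity_) {  // Fast path: almost always taken.
      data_[size_++] = c;
      return;
    }
    AppendSlow(c);            // Slow path: grow the buffer, then store.
  }

 private:
  void AppendSlow(char c);    // Defined out of line, in the .cc file.
  char* data_ = nullptr;
  std::size_t size_ = 0;
  std::size_t capacity_ = 0;
};
```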
    +

    Make fast path cover more common cases.

    +
    + +

Add handling of trailing single ASCII bytes, rather than only handling multiples of four bytes with this routine. This avoids calling the slower generic routine for all-ASCII strings that are, for example, 5 bytes.

    +

    utf8statetable.cc

    + +
    +
    // Scan a UTF-8 stringpiece based on state table.
    +// Always scan complete UTF-8 characters
    +// Set number of bytes scanned. Return reason for exiting
    +// OPTIMIZED for case of 7-bit ASCII 0000..007f all valid
    +int UTF8GenericScanFastAscii(const UTF8ScanObj* st, absl::string_view str,
    +                             int* bytes_consumed) {
    +                             ...
    +  int exit_reason;
    +  do {
    +    //  Skip 8 bytes of ASCII at a whack; no endianness issue
    +    while ((src_limit - src >= 8) &&
    +           (((UNALIGNED_LOAD32(src + 0) | UNALIGNED_LOAD32(src + 4)) &
    +             0x80808080) == 0)) {
    +      src += 8;
    +    }
    +    //  Run state table on the rest
    +    int rest_consumed;
    +    exit_reason = UTF8GenericScan(
    +        st, absl::ClippedSubstr(str, src - initial_src), &rest_consumed);
    +    src += rest_consumed;
    +  } while (exit_reason == kExitDoAgain);
    +
    +  *bytes_consumed = src - initial_src;
    +  return exit_reason;
    +}
    +
    +
    + +
    +
    // Scan a UTF-8 stringpiece based on state table.
    +// Always scan complete UTF-8 characters
    +// Set number of bytes scanned. Return reason for exiting
    +// OPTIMIZED for case of 7-bit ASCII 0000..007f all valid
    +int UTF8GenericScanFastAscii(const UTF8ScanObj* st, absl::string_view str,
    +                             int* bytes_consumed) {
    +                             ...
    +  int exit_reason = kExitOK;
    +  do {
    +    //  Skip 8 bytes of ASCII at a whack; no endianness issue
    +    while ((src_limit - src >= 8) &&
    +           (((UNALIGNED_LOAD32(src + 0) | UNALIGNED_LOAD32(src + 4)) &
    +             0x80808080) == 0)) {
    +      src += 8;
    +    }
    +    while (src < src_limit && Is7BitAscii(*src)) { // Skip ASCII bytes
    +      src++;
    +    }
    +    if (src < src_limit) {
    +      //  Run state table on the rest
    +      int rest_consumed;
    +      exit_reason = UTF8GenericScan(
    +          st, absl::ClippedSubstr(str, src - initial_src), &rest_consumed);
    +      src += rest_consumed;
    +    }
    +  } while (exit_reason == kExitDoAgain);
    +
    +  *bytes_consumed = src - initial_src;
    +  return exit_reason;
    +}
    +
    +
    + + +
    + + + +
    +

    Simpler fast paths for InlinedVector.

    +
    + +

    inlined_vector.h

    + +
    +
    auto Storage<T, N, A>::Resize(ValueAdapter values, size_type new_size) -> void {
    +  StorageView storage_view = MakeStorageView();
    +
    +  IteratorValueAdapter<MoveIterator> move_values(
    +      MoveIterator(storage_view.data));
    +
    +  AllocationTransaction allocation_tx(GetAllocPtr());
    +  ConstructionTransaction construction_tx(GetAllocPtr());
    +
    +  absl::Span<value_type> construct_loop;
    +  absl::Span<value_type> move_construct_loop;
    +  absl::Span<value_type> destroy_loop;
    +
    +  if (new_size > storage_view.capacity) {
    +  ...
    +  } else if (new_size > storage_view.size) {
    +    construct_loop = {storage_view.data + storage_view.size,
    +                      new_size - storage_view.size};
    +  } else {
    +    destroy_loop = {storage_view.data + new_size, storage_view.size - new_size};
    +  }
    +
    +
    + +
    +
    auto Storage<T, N, A>::Resize(ValueAdapter values, size_type new_size) -> void {
    +  StorageView storage_view = MakeStorageView();
    +  auto* const base = storage_view.data;
    +  const size_type size = storage_view.size;
    +  auto* alloc = GetAllocPtr();
    +  if (new_size <= size) {
    +    // Destroy extra old elements.
    +    inlined_vector_internal::DestroyElements(alloc, base + new_size,
    +                                             size - new_size);
    +  } else if (new_size <= storage_view.capacity) {
    +    // Construct new elements in place.
    +    inlined_vector_internal::ConstructElements(alloc, base + size, &values,
    +                                               new_size - size);
    +  } else {
    +  ...
    +  }
    +
    +
    + + +
    + + + +
    +

Fast path for common cases of initializing 1-D to 4-D tensors.

    +
    + +

    tensor_shape.cc

    + +
    +
    template <class Shape>
    +TensorShapeBase<Shape>::TensorShapeBase(gtl::ArraySlice<int64> dim_sizes) {
    +  set_tag(REP16);
    +  set_data_type(DT_INVALID);
    +  set_ndims_byte(0);
    +  set_num_elements(1);
    +  for (int64 s : dim_sizes) {
    +    AddDim(internal::SubtleMustCopy(s));
    +  }
    +}
    +
    +
    + +
    +
    template <class Shape>
    +void TensorShapeBase<Shape>::InitDims(gtl::ArraySlice<int64> dim_sizes) {
    +  DCHECK_EQ(tag(), REP16);
    +
    +  // Allow sizes that are under kint64max^0.25 so that 4-way multiplication
    +  // below cannot overflow.
    +  static const uint64 kMaxSmall = 0xd744;
    +  static_assert(kMaxSmall * kMaxSmall * kMaxSmall * kMaxSmall <= kint64max,
    +                "bad overflow check");
    +  bool large_size = false;
    +  for (auto s : dim_sizes) {
    +    if (s > kMaxSmall) {
    +      large_size = true;
    +      break;
    +    }
    +  }
    +
    +  if (!large_size) {
    +    // Every size fits in 16 bits; use fast-paths for dims in {1,2,3,4}.
    +    uint16* dst = as16()->dims_;
    +    switch (dim_sizes.size()) {
    +      case 1: {
    +        set_ndims_byte(1);
    +        const int64 size = dim_sizes[0];
    +        const bool neg = Set16(kIsPartial, dst, 0, size);
    +        set_num_elements(neg ? -1 : size);
    +        return;
    +      }
    +      case 2: {
    +        set_ndims_byte(2);
    +        const int64 size0 = dim_sizes[0];
    +        const int64 size1 = dim_sizes[1];
    +        bool neg = Set16(kIsPartial, dst, 0, size0);
    +        neg |= Set16(kIsPartial, dst, 1, size1);
    +        set_num_elements(neg ? -1 : (size0 * size1));
    +        return;
    +      }
    +      case 3: {
    +      ...
    +      }
    +      case 4: {
    +      ...
    +      }
    +    }
    +  }
    +
    +  set_ndims_byte(0);
    +  set_num_elements(1);
    +  for (int64 s : dim_sizes) {
    +    AddDim(internal::SubtleMustCopy(s));
    +  }
    +}
    +
    +
    + + +
    + + + +
    +

Make varint parser fast path cover just the 1-byte case, instead of covering 1-byte and 2-byte cases.

    +
    + +

Reducing the size of the (inlined) fast path reduces code size and icache pressure, which leads to improved performance.

    +

    parse_context.h

    + +
    +
    template <typename T>
    +PROTOBUF_NODISCARD const char* VarintParse(const char* p, T* out) {
    +  auto ptr = reinterpret_cast<const uint8_t*>(p);
    +  uint32_t res = ptr[0];
    +  if (!(res & 0x80)) {
    +    *out = res;
    +    return p + 1;
    +  }
    +  uint32_t byte = ptr[1];
    +  res += (byte - 1) << 7;
    +  if (!(byte & 0x80)) {
    +    *out = res;
    +    return p + 2;
    +  }
    +  return VarintParseSlow(p, res, out);
    +}
    +
    +
    + +
    +
    template <typename T>
    +PROTOBUF_NODISCARD const char* VarintParse(const char* p, T* out) {
    +  auto ptr = reinterpret_cast<const uint8_t*>(p);
    +  uint32_t res = ptr[0];
    +  if (!(res & 0x80)) {
    +    *out = res;
    +    return p + 1;
    +  }
    +  return VarintParseSlow(p, res, out);
    +}
    +
    +
    + +

    parse_context.cc

    + +
    +
    std::pair<const char*, uint32_t> VarintParseSlow32(const char* p,
    +                                                   uint32_t res) {
    +  for (std::uint32_t i = 2; i < 5; i++) {
    +  ...
    +}
    +...
    +std::pair<const char*, uint64_t> VarintParseSlow64(const char* p,
    +                                                   uint32_t res32) {
    +  uint64_t res = res32;
    +  for (std::uint32_t i = 2; i < 10; i++) {
    +  ...
    +}
    +
    +
    + +
    +
    std::pair<const char*, uint32_t> VarintParseSlow32(const char* p,
    +                                                   uint32_t res) {
    +  for (std::uint32_t i = 1; i < 5; i++) {
    +  ...
    +}
    +...
    +std::pair<const char*, uint64_t> VarintParseSlow64(const char* p,
    +                                                   uint32_t res32) {
    +  uint64_t res = res32;
    +  for (std::uint32_t i = 1; i < 10; i++) {
    +  ...
    +}
    +
    +
    + + +
    + + + +
    +

Skip significant work in RPC_Stats_Measurement addition if no errors have occurred.

    +
    + +

    rpc-stats.h

    + +
    +
    struct RPC_Stats_Measurement {
    +  ...
    +  double errors[RPC::NUM_ERRORS];
    +
    +
    + +
    +
    struct RPC_Stats_Measurement {
    +  ...
    +  double get_errors(int index) const { return errors[index]; }
    +  void set_errors(int index, double value) {
    +    errors[index] = value;
    +    any_errors_set = true;
    +  }
    + private:
    +  ...
    +  // We make this private so that we can keep track of whether any of
    +  // these values have been set to non-zero values.
    +  double errors[RPC::NUM_ERRORS];
    +  bool any_errors_set;  // True iff any of the errors[i] values are non-zero
    +
    +
    + +

    rpc-stats.cc

    + +
    +
    void RPC_Stats_Measurement::operator+=(const RPC_Stats_Measurement& x) {
    +  ...
    +  for (int i = 0; i < RPC::NUM_ERRORS; ++i) {
    +    errors[i] += x.errors[i];
    +  }
    +}
    +
    +
    + +
    +
    void RPC_Stats_Measurement::operator+=(const RPC_Stats_Measurement& x) {
    +  ...
    +  if (x.any_errors_set) {
    +    for (int i = 0; i < RPC::NUM_ERRORS; ++i) {
    +      errors[i] += x.errors[i];
    +    }
    +    any_errors_set = true;
    +  }
    +}
    +
    +
    + + +
    + + + +
    +

Do array lookup on first byte of string to often avoid fingerprinting full string.

    +
    + +

    soft-tokens-helper.cc

    + +
    +
    bool SoftTokensHelper::IsSoftToken(const StringPiece& token) const {
    +  return soft_tokens_.find(Fingerprint(token.data(), token.size())) !=
    +      soft_tokens_.end();
    +}
    +
    +
    + +

    soft-tokens-helper.h

    + +
    +
    class SoftTokensHelper {
    + ...
    + private:
    +  ...
    +  // Since soft tokens are mostly punctuation-related, for performance
    +  // purposes, we keep an array filter_.  filter_[i] is true iff any
    +  // of the soft tokens start with the byte value 'i'.  This avoids
    +  // fingerprinting a term in the common case, since we can just do an array
    +  // lookup based on the first byte, and if filter_[b] is false, then
    +  // we can return false immediately.
    +  bool          filter_[256];
    +  ...
    +};
    +
    +inline bool SoftTokensHelper::IsSoftToken(const StringPiece& token) const {
    +  if (token.size() >= 1) {
    +    char first_char = token.data()[0];
    +    if (!filter_[first_char]) {
    +      return false;
    +    }
    +  }
    +  return IsSoftTokenFallback(token);
    +}
    +
    +
    + +

    soft-tokens-helper.cc

    + +
    +
    bool SoftTokensHelper::IsSoftTokenFallback(const StringPiece& token) const {
    +  return soft_tokens_.find(Fingerprint(token.data(), token.size())) !=
    +      soft_tokens_.end();
    +}
    +
    +
    + + +
### Precompute expensive information once
    +

Precompute a TensorFlow graph execution node property that allows us to quickly rule out certain unusual cases.

    +
    + +

    executor.cc

    + +
    +
    struct NodeItem {
    +  ...
    +  bool kernel_is_expensive = false;  // True iff kernel->IsExpensive()
    +  bool kernel_is_async = false;      // True iff kernel->AsAsync() != nullptr
    +  bool is_merge = false;             // True iff IsMerge(node)
    +  ...
    +  if (IsEnter(node)) {
    +  ...
    +  } else if (IsExit(node)) {
    +  ...
    +  } else if (IsNextIteration(node)) {
    +  ...
    +  } else {
    +    // Normal path for most nodes
    +    ...
    +  }
    +
    +
    + +
    +
    struct NodeItem {
    +  ...
    +  bool kernel_is_expensive : 1;  // True iff kernel->IsExpensive()
    +  bool kernel_is_async : 1;      // True iff kernel->AsAsync() != nullptr
    +  bool is_merge : 1;             // True iff IsMerge(node)
    +  bool is_enter : 1;             // True iff IsEnter(node)
    +  bool is_exit : 1;              // True iff IsExit(node)
    +  bool is_control_trigger : 1;   // True iff IsControlTrigger(node)
    +  bool is_sink : 1;              // True iff IsSink(node)
    +  // True iff IsEnter(node) || IsExit(node) || IsNextIteration(node)
    +  bool is_enter_exit_or_next_iter : 1;
    +  ...
    +  if (!item->is_enter_exit_or_next_iter) {
    +    // Fast path for nodes types that don't need special handling
    +    DCHECK_EQ(input_frame, output_frame);
    +    ...
    +  } else if (item->is_enter) {
    +  ...
    +  } else if (item->is_exit) {
    +  ...
    +  } else {
    +    DCHECK(IsNextIteration(node));
    +    ...
    +  }
    +
    +
    + + +
    + + + +
    +

Precompute a 256-element array and use it during trigram initialization.

    +
    + +

    byte_trigram_classifier.cc

    + +
    +
    void ByteTrigramClassifier::VerifyModel(void) const {
    +  ProbT class_sums[num_classes_];
    +  for (int cls = 0; cls < num_classes_; cls++) {
    +    class_sums[cls] = 0;
    +  }
    +  for (ByteNgramId id = 0; id < trigrams_.num_trigrams(); id++) {
    +    for (int cls = 0; cls < num_classes_; ++cls) {
    +      class_sums[cls] += Prob(trigram_probs_[id].log_probs[cls]);
    +    }
    +  }
    +  ...
    +}                         
    +
    +
    + +
    +
    void ByteTrigramClassifier::VerifyModel(void) const {
    +  CHECK_EQ(sizeof(ByteLogProbT), 1);
    +  ProbT fast_prob[256];
    +  for (int b = 0; b < 256; b++) {
    +    fast_prob[b] = Prob(static_cast<ByteLogProbT>(b));
    +  }
    +
    +  ProbT class_sums[num_classes_];
    +  for (int cls = 0; cls < num_classes_; cls++) {
    +    class_sums[cls] = 0;
    +  }
    +  for (ByteNgramId id = 0; id < trigrams_.num_trigrams(); id++) {
    +    for (int cls = 0; cls < num_classes_; ++cls) {
    +      class_sums[cls] += fast_prob[trigram_probs_[id].log_probs[cls]];
    +    }
    +  }
    +  ...
    +}                         
    +
    +
    + + +
General advice: check for malformed inputs at module boundaries instead of
repeating checks internally.
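A sketch of the boundary-check pattern, assuming a hypothetical `IsValidUtf8`
validator and `ParsedQuery` type:

```c++
#include <cassert>
#include <optional>
#include <string_view>

bool IsValidUtf8(std::string_view s);  // Assumed validator, defined elsewhere.

struct ParsedQuery { /* ... */ };

// Internal helper: relies on the boundary check instead of re-validating, so
// the cost is paid once per input rather than once per helper call.
inline ParsedQuery ParseValidatedQuery(std::string_view raw) {
  assert(IsValidUtf8(raw));  // Debug-build check only; free when optimized.
  return ParsedQuery{};
}

// Module boundary: reject malformed input exactly once, up front.
inline std::optional<ParsedQuery> ParseQuery(std::string_view raw) {
  if (!IsValidUtf8(raw)) return std::nullopt;
  return ParseValidatedQuery(raw);
}
```

### Move expensive computations outside loops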
    +

    Move bounds computation outside loop.

    +
    + +

    literal_linearizer.cc

    + +
    +
    for (int64 i = 0; i < src_shape.dimensions(dimension_numbers.front());
    +     ++i) {
    +
    +
    + +
    +
    int64 dim_front = src_shape.dimensions(dimension_numbers.front());
    +const uint8* src_buffer_data = src_buffer.data();
    +uint8* dst_buffer_data = dst_buffer.data();
    +for (int64 i = 0; i < dim_front; ++i) {
    +
    +
    + + +
### Defer expensive computation {#defer-expensive-computation}
    +

Defer the GetSubSharding call until needed, which reduces 43 seconds of CPU time to 2 seconds.

    +
    + +

    sharding_propagation.cc

    + +
    +
    HloSharding alternative_sub_sharding =
    +    user.sharding().GetSubSharding(user.shape(), {i});
    +if (user.operand(i) == &instruction &&
    +    hlo_sharding_util::IsShardingMoreSpecific(alternative_sub_sharding,
    +                                              sub_sharding)) {
    +  sub_sharding = alternative_sub_sharding;
    +}
    +
    +
    + +
    +
    if (user.operand(i) == &instruction) {
    +  // Only evaluate GetSubSharding if this operand is of interest,
    +  // as it is relatively expensive.
    +  HloSharding alternative_sub_sharding =
    +      user.sharding().GetSubSharding(user.shape(), {i});
    +  if (hlo_sharding_util::IsShardingMoreSpecific(
    +          alternative_sub_sharding, sub_sharding)) {
    +    sub_sharding = alternative_sub_sharding;
    +  }
    +}
    +
    +
    + + +
    + + + +
    +

    Don't update stats eagerly; compute them on demand.

    +
    + +

Do not update stats on the very frequent allocation/deallocation calls. Instead, compute stats on demand when the much less frequently called Stats() method is invoked.

    + +
    + + + + + +
    +

Preallocate 10 nodes, not 200, for query handling in Google's web server.

    +
    + +

A simple change that reduced the web server's CPU usage by 7.5%.

    +

    querytree.h

    + +
    +
    static const int kInitParseTreeSize = 200;   // initial size of querynode pool
    +
    +
    + +
    +
    static const int kInitParseTreeSize = 10;   // initial size of querynode pool
    +
    +
    + + +
    + + + +
    +

Change search order for a 19% throughput improvement.

    +
    + +

An old search system (circa 2000) had two tiers: one contained a full-text index, and the other contained just the index for the title and anchor terms. We used to search the smaller title/anchor tier first. Counter-intuitively, we found that it is cheaper to search the larger full-text index tier first, since if we reach the end of the full-text tier, we can entirely skip searching the title/anchor tier (a subset of the full-text tier). This happened reasonably often and allowed us to reduce the average number of disk seeks needed to process a query.

    +

See the discussion of title and anchor text handling in *The Anatomy of a Large-Scale Hypertextual Web Search Engine* for background information.

    + +
### Specialize code

A particular performance-sensitive call site may not need the full generality
provided by a general-purpose library. Consider writing specialized code for
such call sites instead of calling the general-purpose code, if doing so yields
a measurable performance improvement.
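A tiny sketch of the idea; the premise that a hot call site only ever sees
ASCII input is an assumption here, and exactly the kind of call-site knowledge
that justifies specialization:

```c++
#include <string>

// General-purpose std::tolower consults the current locale on every call.
// When the input is known to be ASCII, a single branch suffices.
inline char AsciiToLower(char c) {
  return (c >= 'A' && c <= 'Z') ? static_cast<char>(c - 'A' + 'a') : c;
}

std::string LowercaseAsciiKey(std::string key) {
  for (char& c : key) c = AsciiToLower(c);
  return key;
}
```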
    +

Custom printing code for the Histogram class is 4x as fast as sprintf.

    +
    + +

This code is performance sensitive because it is invoked when monitoring systems gather statistics from various servers.

    +

    histogram_export.cc

    + +
    +
    void Histogram::PopulateBuckets(const string &prefix,
    +                                expvar::MapProto *const var) const {
    +                                ...
    +  for (int i = min_bucket; i <= max_bucket; ++i) {
    +    const double count = BucketCount(i);
    +    if (!export_empty_buckets && count == 0.0) continue;
    +    acc += count;
    +    // The label format of exported buckets for discrete histograms
    +    // specifies an inclusive upper bound, which is the same as in
    +    // the original Histogram implementation.  This format is not
    +    // applicable to non-discrete histograms, so a half-open interval
    +    // is used for them, with "_" instead of "-" as a separator to
    +    // make possible to distinguish the formats.
    +    string key =
    +        options_.export_cumulative_counts() ?
    +            StringPrintf("%.12g", boundaries_->BucketLimit(i)) :
    +        options_.discrete() ?
    +            StringPrintf("%.0f-%.0f",
    +                         ceil(boundaries_->BucketStart(i)),
    +                         ceil(boundaries_->BucketLimit(i)) - 1.0) :
    +            StringPrintf("%.12g_%.12g",
    +                         boundaries_->BucketStart(i),
    +                         boundaries_->BucketLimit(i));
    +    EscapeMapKey(&key);
    +    const double value = options_.export_cumulative_counts() ? acc : count;
    +    expvar::AddMapFloat(StrCat(prefix,
    +                               options_.export_bucket_key_prefix(),
    +                               key),
    +                        value * count_mult,
    +                        var);
    +  }
    +
    +
    + +
    +
    // Format "val" according to format.  If "need_escape" is true, then the
    +// format can produce output with a '.' in it, and the result will be escaped.
    +// If "need_escape" is false, then the caller guarantees that format is
    +// such that the resulting number will not have any '.' characters and
    +// therefore we can avoid calling EscapeKey.
    +// The function is free to use "*scratch" for scratch space if necessary,
    +// and the resulting StringPiece may point into "*scratch".
    +static StringPiece FormatNumber(const char* format,
    +                                bool need_escape,
    +                                double val, string* scratch) {
    +  // This routine is specialized to work with only a limited number of formats
    +  DCHECK(StringPiece(format) == "%.0f" || StringPiece(format) == "%.12g");
    +
    +  scratch->clear();
    +  if (val == trunc(val) && val >= kint32min && val <= kint32max) {
    +    // An integer for which we can just use StrAppend
    +    StrAppend(scratch, static_cast<int32>(val));
    +    return StringPiece(*scratch);
    +  } else if (isinf(val)) {
    +    // Infinity, represent as just 'inf'.
    +    return StringPiece("inf", 3);
    +  } else {
    +    // Format according to "format", and possibly escape.
    +    StringAppendF(scratch, format, val);
    +    if (need_escape) {
    +      EscapeMapKey(scratch);
    +    } else {
    +      DCHECK(!StringPiece(*scratch).contains("."));
    +    }
    +    return StringPiece(*scratch);
    +  }
    +}
    +...
    +void Histogram::PopulateBuckets(const string &prefix,
    +                                expvar::MapProto *const var) const {
    +                                ...
    +  const string full_key_prefix = StrCat(prefix,
    +                                        options_.export_bucket_key_prefix());
    +  string key = full_key_prefix;  // Keys will start with "full_key_prefix".
    +  string start_scratch;
    +  string limit_scratch;
    +  const bool cumul_counts = options_.export_cumulative_counts();
    +  const bool discrete = options_.discrete();
    +  for (int i = min_bucket; i <= max_bucket; ++i) {
    +    const double count = BucketCount(i);
    +    if (!export_empty_buckets && count == 0.0) continue;
    +    acc += count;
    +    // The label format of exported buckets for discrete histograms
    +    // specifies an inclusive upper bound, which is the same as in
    +    // the original Histogram implementation.  This format is not
    +    // applicable to non-discrete histograms, so a half-open interval
    +    // is used for them, with "_" instead of "-" as a separator to
    +    // make possible to distinguish the formats.
    +    key.resize(full_key_prefix.size());  // Start with full_key_prefix.
    +    DCHECK_EQ(key, full_key_prefix);
    +
    +    const double limit = boundaries_->BucketLimit(i);
    +    if (cumul_counts) {
    +      StrAppend(&key, FormatNumber("%.12g", true, limit, &limit_scratch));
    +    } else {
    +      const double start = boundaries_->BucketStart(i);
    +      if (discrete) {
    +        StrAppend(&key,
    +                  FormatNumber("%.0f", false, ceil(start), &start_scratch),
    +                  "-",
    +                  FormatNumber("%.0f", false, ceil(limit) - 1.0,
    +                               &limit_scratch));
    +      } else {
    +        StrAppend(&key,
    +                  FormatNumber("%.12g", true, start, &start_scratch),
    +                  "_",
    +                  FormatNumber("%.12g", true, limit, &limit_scratch));
    +      }
    +    }
    +    const double value = cumul_counts ? acc : count;
    +
    +    // Add to map var
    +    expvar::AddMapFloat(key, value * count_mult, var);
    +  }
    +}
    +
    +
    + + +
    + + + +
    +

Add specializations for VLOG(1), VLOG(2), … for speed and smaller code size.

    +
    + +

VLOG is a heavily used macro throughout the code base. This change avoids passing an extra integer constant at nearly every call site (if the log level is constant at the call site, as it almost always is, as in VLOG(1) << ...), which saves code space.

    +

    vlog_is_on.h

    + +
    +
    class VLogSite final {
    + public:
    +  ...
    +  bool IsEnabled(int level) {
    +    int stale_v = v_.load(std::memory_order_relaxed);
    +    if (ABSL_PREDICT_TRUE(level > stale_v)) {
    +      return false;
    +    }
    +
    +    // We put everything other than the fast path, i.e. vlogging is initialized
    +    // but not on, behind an out-of-line function to reduce code size.
    +    return SlowIsEnabled(stale_v, level);
    +  }
    +  ...
    + private:
    +  ...
    +  ABSL_ATTRIBUTE_NOINLINE
    +  bool SlowIsEnabled(int stale_v, int level);
    +  ...
    +};
    +
    +
    + +
    +
    class VLogSite final {
    + public:
    +  ...
    +  bool IsEnabled(int level) {
    +    int stale_v = v_.load(std::memory_order_relaxed);
    +    if (ABSL_PREDICT_TRUE(level > stale_v)) {
    +      return false;
    +    }
    +
    +    // We put everything other than the fast path, i.e. vlogging is initialized
    +    // but not on, behind an out-of-line function to reduce code size.
    +    // "level" is almost always a call-site constant, so we can save a bit
    +    // of code space by special-casing for levels 1, 2, and 3.
    +#if defined(__has_builtin) && __has_builtin(__builtin_constant_p)
    +    if (__builtin_constant_p(level)) {
    +      if (level == 0) return SlowIsEnabled0(stale_v);
    +      if (level == 1) return SlowIsEnabled1(stale_v);
    +      if (level == 2) return SlowIsEnabled2(stale_v);
    +      if (level == 3) return SlowIsEnabled3(stale_v);
    +      if (level == 4) return SlowIsEnabled4(stale_v);
    +      if (level == 5) return SlowIsEnabled5(stale_v);
    +    }
    +#endif
    +    return SlowIsEnabled(stale_v, level);
    +    ...
    + private:
    +  ...
    +  ABSL_ATTRIBUTE_NOINLINE
    +  bool SlowIsEnabled(int stale_v, int level);
    +  ABSL_ATTRIBUTE_NOINLINE bool SlowIsEnabled0(int stale_v);
    +  ABSL_ATTRIBUTE_NOINLINE bool SlowIsEnabled1(int stale_v);
    +  ABSL_ATTRIBUTE_NOINLINE bool SlowIsEnabled2(int stale_v);
    +  ABSL_ATTRIBUTE_NOINLINE bool SlowIsEnabled3(int stale_v);
    +  ABSL_ATTRIBUTE_NOINLINE bool SlowIsEnabled4(int stale_v);
    +  ABSL_ATTRIBUTE_NOINLINE bool SlowIsEnabled5(int stale_v);
    +  ...
    +};
    +
    +
    + +

    vlog_is_on.cc

    + +
    +
    bool VLogSite::SlowIsEnabled0(int stale_v) { return SlowIsEnabled(stale_v, 0); }
    +bool VLogSite::SlowIsEnabled1(int stale_v) { return SlowIsEnabled(stale_v, 1); }
    +bool VLogSite::SlowIsEnabled2(int stale_v) { return SlowIsEnabled(stale_v, 2); }
    +bool VLogSite::SlowIsEnabled3(int stale_v) { return SlowIsEnabled(stale_v, 3); }
    +bool VLogSite::SlowIsEnabled4(int stale_v) { return SlowIsEnabled(stale_v, 4); }
    +bool VLogSite::SlowIsEnabled5(int stale_v) { return SlowIsEnabled(stale_v, 5); }
    +
    +
    + + +
    + + + +
    +

    Replace RE2 call with a simple prefix match when possible.

    +
    + +

    read_matcher.cc

    + +
    +
    enum MatchItemType {
    +  MATCH_TYPE_INVALID,
    +  MATCH_TYPE_RANGE,
    +  MATCH_TYPE_EXACT,
    +  MATCH_TYPE_REGEXP,
    +};
    +
    +
    + +
    +
    enum MatchItemType {
    +  MATCH_TYPE_INVALID,
    +  MATCH_TYPE_RANGE,
    +  MATCH_TYPE_EXACT,
    +  MATCH_TYPE_REGEXP,
    +  MATCH_TYPE_PREFIX,   // Special type for regexp ".*"
    +};
    +
    +
    + +

    read_matcher.cc

    + +
    +
    p->type = MATCH_TYPE_REGEXP;
    +
    +
    + +
    +
    term.NonMetaPrefix().CopyToString(&p->prefix);
    +if (term.RegexpSuffix() == ".*") {
    +  // Special case for a regexp that matches anything, so we can
    +  // bypass RE2::FullMatch
    +  p->type = MATCH_TYPE_PREFIX;
    +} else {
    +  p->type = MATCH_TYPE_REGEXP;
    +
    +
    + + +
    + + + +
    +

Use StrCat rather than StringPrintf to format IP addresses.

    +
    + +

    ipaddress.cc

    + +
    +
    string IPAddress::ToString() const {
    +  char buf[INET6_ADDRSTRLEN];
    +
    +  switch (address_family_) {
    +    case AF_INET:
    +      CHECK(inet_ntop(AF_INET, &addr_.addr4, buf, INET6_ADDRSTRLEN) != NULL);
    +      return buf;
    +    case AF_INET6:
    +      CHECK(inet_ntop(AF_INET6, &addr_.addr6, buf, INET6_ADDRSTRLEN) != NULL);
    +      return buf;
    +    case AF_UNSPEC:
    +      LOG(DFATAL) << "Calling ToString() on an empty IPAddress";
    +      return "";
    +    default:
    +      LOG(FATAL) << "Unknown address family " << address_family_;
    +  }
    +}
    +...
    +string IPAddressToURIString(const IPAddress& ip) {
    +  switch (ip.address_family()) {
    +    case AF_INET6:
    +      return StringPrintf("[%s]", ip.ToString().c_str());
    +    default:
    +      return ip.ToString();
    +  }
    +}
    +...
    +string SocketAddress::ToString() const {
    +  return IPAddressToURIString(host_) + StringPrintf(":%u", port_);
    +}
    +
    +
    + +
    +
    string IPAddress::ToString() const {
    +  char buf[INET6_ADDRSTRLEN];
    +
    +  switch (address_family_) {
    +    case AF_INET: {
    +      uint32 addr = gntohl(addr_.addr4.s_addr);
    +      int a1 = static_cast<int>((addr >> 24) & 0xff);
    +      int a2 = static_cast<int>((addr >> 16) & 0xff);
    +      int a3 = static_cast<int>((addr >> 8) & 0xff);
    +      int a4 = static_cast<int>(addr & 0xff);
    +      return StrCat(a1, ".", a2, ".", a3, ".", a4);
    +    }
    +    case AF_INET6:
    +      CHECK(inet_ntop(AF_INET6, &addr_.addr6, buf, INET6_ADDRSTRLEN) != NULL);
    +      return buf;
    +    case AF_UNSPEC:
    +      LOG(DFATAL) << "Calling ToString() on an empty IPAddress";
    +      return "";
    +    default:
    +      LOG(FATAL) << "Unknown address family " << address_family_;
    +  }
    +}
    +...
    +string IPAddressToURIString(const IPAddress& ip) {
    +  switch (ip.address_family()) {
    +    case AF_INET6:
    +      return StrCat("[", ip.ToString(), "]");
    +    default:
    +      return ip.ToString();
    +  }
    +}
    +...
    +string SocketAddress::ToString() const {
    +  return StrCat(IPAddressToURIString(host_), ":", port_);
    +}
    +
    +
    + + +
### Use caching to avoid repeated work {#use-caching-to-avoid-repeated-work}
    +

Cache based on a precomputed fingerprint of a large serialized proto.

    +
    + +

    dp_ops.cc

    + +
    +
    InputOutputMappingProto mapping_proto;
    +PLAQUE_OP_REQUIRES(
    +    mapping_proto.ParseFromStringPiece(GetAttrMappingProto(state)),
    +    absl::InternalError("Failed to parse InputOutputMappingProto"));
    +ParseMapping(mapping_proto);
    +
    +
    + +
    +
    uint64 mapping_proto_fp = GetAttrMappingProtoFp(state);
    +{
    +  absl::MutexLock l(&fp_to_iometa_mu);
    +  if (fp_to_iometa == nullptr) {
    +    fp_to_iometa =
    +        new absl::flat_hash_map<uint64, std::unique_ptr<ProgramIOMetadata>>;
    +  }
    +  auto it = fp_to_iometa->find(mapping_proto_fp);
    +  if (it != fp_to_iometa->end()) {
    +    io_metadata_ = it->second.get();
    +  } else {
    +    auto serial_proto = GetAttrMappingProto(state);
    +    DCHECK_EQ(mapping_proto_fp, Fingerprint(serial_proto));
    +    InputOutputMappingProto mapping_proto;
    +    PLAQUE_OP_REQUIRES(
    +        mapping_proto.ParseFromStringPiece(GetAttrMappingProto(state)),
    +        absl::InternalError("Failed to parse InputOutputMappingProto"));
    +    auto io_meta = ParseMapping(mapping_proto);
    +    io_metadata_ = io_meta.get();
    +    (*fp_to_iometa)[mapping_proto_fp] = std::move(io_meta);
    +  }
    +}
    +
    +
    + + +
### Make the compiler's job easier

The compiler may have trouble optimizing through layers of abstractions because
it must make conservative assumptions about the overall behavior of the code, or
may not make the right speed vs. size tradeoffs. The application programmer will
often know more about the behavior of the system and can aid the compiler by
rewriting the code to operate at a lower level. However, only do this when
profiles show an issue, since compilers will often get things right on their
own. Looking at the generated assembly code for performance-critical routines
can help you understand whether the compiler is "getting it right". Pprof
provides a very helpful [display of source code interleaved with
disassembly][annotated source], annotated with performance data.

Some techniques that may be useful:

1. Avoid function calls in hot functions (allows the compiler to avoid frame
   setup costs).
2. Move slow-path code into a separate tail-called function.
3. Copy small amounts of data into local variables before heavy use (see the
   sketch after this list). This can let the compiler assume there is no
   aliasing with other data, which may improve auto-vectorization and register
   allocation.
4. Hand-unroll very hot loops.
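For example, a minimal sketch of technique 3 (the `Summer` type is
illustrative):

```c++
#include <cstddef>
#include <cstdint>

// Copying the member pointer and size into locals before the loop lets the
// compiler assume they cannot change or alias the output inside the loop,
// which helps auto-vectorization and register allocation.
struct Summer {
  const std::int32_t* data_;
  std::size_t size_;
  std::int64_t total_ = 0;

  void Accumulate() {
    const std::int32_t* data = data_;  // Local copies made once, up front.
    const std::size_t size = size_;
    std::int64_t total = 0;
    for (std::size_t i = 0; i < size; ++i) total += data[i];
    total_ = total;                    // Single write-back at the end.
  }
};
```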
    +

Speed up ShapeUtil::ForEachState by replacing absl::Span with raw pointers to the underlying arrays.

    +
    + +

    shape_util.h

    + +
    +
    struct ForEachState {
    +  ForEachState(const Shape& s, absl::Span<const int64_t> b,
    +               absl::Span<const int64_t> c, absl::Span<const int64_t> i);
    +  ~ForEachState();
    +
    +  const Shape& shape;
    +  const absl::Span<const int64_t> base;
    +  const absl::Span<const int64_t> count;
    +  const absl::Span<const int64_t> incr;
    +
    +
    + +
    +
    struct ForEachState {
    +  ForEachState(const Shape& s, absl::Span<const int64_t> b,
    +               absl::Span<const int64_t> c, absl::Span<const int64_t> i);
    +  inline ~ForEachState() = default;
    +
    +  const Shape& shape;
    +  // Pointers to arrays of the passed-in spans
    +  const int64_t* const base;
    +  const int64_t* const count;
    +  const int64_t* const incr;
    +
    +
    + + +
    + + + +
    +

Hand unroll cyclic redundancy check (CRC) computation loop.

    +
    + +

    crc.cc

    + +
    +
    void CRC32::Extend(uint64 *lo, uint64 *hi, const void *bytes, size_t length)
    +                      const {
    +                      ...
    +  // Process bytes 4 at a time
    +  while ((p + 4) <= e) {
    +    uint32 c = l ^ WORD(p);
    +    p += 4;
    +    l = this->table3_[c & 0xff] ^
    +        this->table2_[(c >> 8) & 0xff] ^
    +        this->table1_[(c >> 16) & 0xff] ^
    +        this->table0_[c >> 24];
    +  }
    +
    +  // Process the last few bytes
    +  while (p != e) {
    +    int c = (l & 0xff) ^ *p++;
    +    l = this->table0_[c] ^ (l >> 8);
    +  }
    +  *lo = l;
    +}
    +
    +
    + +
    +
    void CRC32::Extend(uint64 *lo, uint64 *hi, const void *bytes, size_t length)
    +                      const {
    +                      ...
    +#define STEP {                                  \
    +    uint32 c = l ^ WORD(p);                     \
    +    p += 4;                                     \
    +    l = this->table3_[c & 0xff] ^               \
    +        this->table2_[(c >> 8) & 0xff] ^        \
    +        this->table1_[(c >> 16) & 0xff] ^       \
    +        this->table0_[c >> 24];                 \
    +}
    +
    +  // Process bytes 16 at a time
    +  while ((e-p) >= 16) {
    +    STEP;
    +    STEP;
    +    STEP;
    +    STEP;
    +  }
    +
    +  // Process bytes 4 at a time
    +  while ((p + 4) <= e) {
    +    STEP;
    +  }
    +#undef STEP
    +
    +  // Process the last few bytes
    +  while (p != e) {
    +    int c = (l & 0xff) ^ *p++;
    +    l = this->table0_[c] ^ (l >> 8);
    +  }
    +  *lo = l;
    +}
    +
    +
    +
    + + +
    + + + +
    +

Handle four characters at a time when parsing Spanner keys.

    +
    + +
      +
1. Hand unroll loop to deal with four characters at a time rather than using
   memchr.
2. Manually unroll loop for finding separated sections of name.
3. Go backwards to find separated portions of a name with '#' separators
   (rather than forwards) since the first part is likely the longest in the
   name.
    +

    key.cc

    + +
    +
    void Key::InitSeps(const char* start) {
    +  const char* base = &rep_[0];
    +  const char* limit = base + rep_.size();
    +  const char* s = start;
    +
    +  DCHECK_GE(s, base);
    +  DCHECK_LT(s, limit);
    +
    +  for (int i = 0; i < 3; i++) {
    +    s = (const char*)memchr(s, '#', limit - s);
    +    DCHECK(s != NULL);
    +    seps_[i] = s - base;
    +    s++;
    +  }
    +}
    +
    +
    + +
    +
    inline const char* ScanBackwardsForSep(const char* base, const char* p) {
    +  while (p >= base + 4) {
    +    if (p[0] == '#') return p;
    +    if (p[-1] == '#') return p-1;
    +    if (p[-2] == '#') return p-2;
    +    if (p[-3] == '#') return p-3;
    +    p -= 4;
    +  }
    +  while (p >= base && *p != '#') p--;
    +  return p;
    +}
    +
    +void Key::InitSeps(const char* start) {
    +  const char* base = &rep_[0];
    +  const char* limit = base + rep_.size();
    +  const char* s = start;
    +
    +  DCHECK_GE(s, base);
    +  DCHECK_LT(s, limit);
    +
    +  // We go backwards from the end of the string, rather than forwards,
    +  // since the directory name might be long and definitely doesn't contain
    +  // any '#' characters.
    +  const char* p = ScanBackwardsForSep(s, limit - 1);
    +  DCHECK(*p == '#');
    +  seps_[2] = p - base;
    +  p--;
    +
    +  p = ScanBackwardsForSep(s, p);
    +  DCHECK(*p == '#');
    +  seps_[1] = p - base;
    +  p--;
    +
    +  p = ScanBackwardsForSep(s, p);
    +  DCHECK(*p == '#');
    +  seps_[0] = p - base;
    +}
    +
    +
    + + +
    + + + +
    +

Avoid frame setup costs by converting ABSL_LOG(FATAL) to ABSL_DCHECK(false).

    +
    + +

    arena_cleanup.h

    + +
    +
    inline ABSL_ATTRIBUTE_ALWAYS_INLINE size_t Size(Tag tag) {
    +  if (!EnableSpecializedTags()) return sizeof(DynamicNode);
    +
    +  switch (tag) {
    +    case Tag::kDynamic:
    +      return sizeof(DynamicNode);
    +    case Tag::kString:
    +      return sizeof(TaggedNode);
    +    case Tag::kCord:
    +      return sizeof(TaggedNode);
    +    default:
    +      ABSL_LOG(FATAL) << "Corrupted cleanup tag: " << static_cast<int>(tag);
    +      return sizeof(DynamicNode);
    +  }
    +}
    +
    +
    + +
    +
    inline ABSL_ATTRIBUTE_ALWAYS_INLINE size_t Size(Tag tag) {
    +  if (!EnableSpecializedTags()) return sizeof(DynamicNode);
    +
    +  switch (tag) {
    +    case Tag::kDynamic:
    +      return sizeof(DynamicNode);
    +    case Tag::kString:
    +      return sizeof(TaggedNode);
    +    case Tag::kCord:
    +      return sizeof(TaggedNode);
    +    default:
    +      ABSL_DCHECK(false) << "Corrupted cleanup tag: " << static_cast<int>(tag);
    +      return sizeof(DynamicNode);
    +  }
    +}
    +
    +
    + + +
### Reduce stats collection costs

Balance the utility of stats and other behavioral information about a system
against the cost of maintaining that information. The extra information can
often help people understand and improve high-level behavior, but it can also
be costly to maintain.

Stats that are not useful can be dropped altogether.
    +

Stop maintaining expensive stats about the number of alarms and closures in SelectServer.

    +
    + +

Part of a set of changes that reduced the time to set an alarm from 771 ns to 271 ns.

    +

    selectserver.h

    + +
    +
    class SelectServer {
    + public:
    + ...
    + protected:
    +  ...
    +  scoped_ptr<MinuteTenMinuteHourStat> num_alarms_stat_;
    +  ...
    +  scoped_ptr<MinuteTenMinuteHourStat> num_closures_stat_;
    +  ...
    +};
    +
    +
    + +
    +
    // Selectserver class
    +class SelectServer {
    + ...
    + protected:
    + ...
    +};
    +
    +
    + +

    /selectserver.cc

    + +
    +
    void SelectServer::AddAlarmInternal(Alarmer* alarmer,
    +                                    int offset_in_ms,
    +                                    int id,
    +                                    bool is_periodic) {
    +                                    ...
    +  alarms_->insert(alarm);
    +  num_alarms_stat_->IncBy(1);
    +  ...
    +}
    +
    +
    + +
    +
    void SelectServer::AddAlarmInternal(Alarmer* alarmer,
    +                                    int offset_in_ms,
    +                                    int id,
    +                                    bool is_periodic) {
    +                                    ...
    +  alarms_->Add(alarm);
    +  ...
    +}
    +
    +
    + +

    /selectserver.cc

    + +
    +
    void SelectServer::RemoveAlarm(Alarmer* alarmer, int id) {
    +      ...
    +      alarms_->erase(alarm);
    +      num_alarms_stat_->IncBy(-1);
    +      ...
    +}
    +
    +
    + +
    +
    void SelectServer::RemoveAlarm(Alarmer* alarmer, int id) {
    +      ...
    +      alarms_->Remove(alarm);
    +      ...
    +}
    +
    +
    + + +
Often, stats or other properties can be maintained for a sample of the elements
handled by the system (e.g., RPC requests, input records, users). Many
subsystems use this approach (tcmalloc allocation tracking, /requestz status
pages, Dapper samples).

When sampling, consider reducing the sampling rate when appropriate.
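A minimal sketch of cheap power-of-two sampling; the names are illustrative,
and the counter is assumed to be updated by a single thread:

```c++
#include <cstdint>

class RequestStats {
 public:
  void OnRequest() {
    // `x & 31` compiles to a mask, unlike `x % 10`, which needs a divide.
    if ((counter_++ & 31) == 0) {  // True for 1 request in every 32.
      RecordExpensiveStats();      // Histograms are touched only when sampled.
    }
  }

 private:
  void RecordExpensiveStats();     // Defined elsewhere; the costly part.
  std::uint64_t counter_ = 0;
};
```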
    +

    Maintain stats for just a sample of doc info requests.

    +
    + +

Sampling allows us to avoid touching 39 histograms and MinuteTenMinuteHour stats for most requests.

    +

    generic-leaf-stats.cc

    +
    ... code that touches 39 histograms to update various stats on every request ...
    +
    +
    // Add to the histograms periodically
    +if (TryLockToUpdateHistogramsDocInfo(docinfo_stats, bucket)) {
    +  // Returns true and grabs bucket->lock only if we should sample this
    +  // request for maintaining stats
    +  ... code that touches 39 histograms to update various stats ...
    +  bucket->lock.Unlock();
    +}
    +
    + +
    + + + +
    +

    Reduce sampling rate and make faster sampling decisions.

    +
    + +

This change reduces the sampling rate from 1 in 10 to 1 in 32. Furthermore, we now keep execution time stats just for the sampled events and speed up sampling decisions by using a power-of-two modulus. This code is called on every packet in the Google Meet video conferencing system and needed performance work to keep up with capacity demands during the first part of the COVID outbreak, as users rapidly migrated to doing more online meetings.

    +

    packet_executor.cc

    + +
    +
    class ScopedPerformanceMeasurement {
    + public:
    +  explicit ScopedPerformanceMeasurement(PacketExecutor* packet_executor)
    +      : packet_executor_(packet_executor),
    +        tracer_(packet_executor->packet_executor_trace_threshold_,
    +                kClosureTraceName) {
    +    // ThreadCPUUsage is an expensive call. At the time of writing,
    +    // it takes over 400ns, or roughly 30 times slower than absl::Now,
    +    // so we sample only 10% of closures to keep the cost down.
    +    if (packet_executor->closures_executed_ % 10 == 0) {
    +      thread_cpu_usage_start_ = base::ThreadCPUUsage();
    +    }
    +
    +    // Sample start time after potentially making the above expensive call,
    +    // so as not to pollute wall time measurements.
    +    run_start_time_ = absl::Now();
    +  }
    +
    +  ~ScopedPerformanceMeasurement() {
    +
    +
    + +
    +
    ScopedPerformanceMeasurement::ScopedPerformanceMeasurement(
    +    PacketExecutor* packet_executor)
    +    : packet_executor_(packet_executor),
    +      tracer_(packet_executor->packet_executor_trace_threshold_,
    +              kClosureTraceName) {
    +  // ThreadCPUUsage is an expensive call. At the time of writing,
    +  // it takes over 400ns, or roughly 30 times slower than absl::Now,
    +  // so we sample only 1 in 32 closures to keep the cost down.
    +  if (packet_executor->closures_executed_ % 32 == 0) {
    +    thread_cpu_usage_start_ = base::ThreadCPUUsage();
    +  }
    +
    +  // Sample start time after potentially making the above expensive call,
    +  // so as not to pollute wall time measurements.
    +  run_start_time_ = absl::Now();
    +}
    +
    +
    + +

    packet_executor.cc

    + +
    +
    ~ScopedPerformanceMeasurement() {
    +  auto run_end_time = absl::Now();
    +  auto run_duration = run_end_time - run_start_time_;
    +
    +  if (thread_cpu_usage_start_.has_value()) {
    +  ...
    +  }
    +
    +  closure_execution_time->Record(absl::ToInt64Microseconds(run_duration));
    +
    +
    + +
    +
    ScopedPerformanceMeasurement::~ScopedPerformanceMeasurement() {
    +  auto run_end_time = absl::Now();
    +  auto run_duration = run_end_time - run_start_time_;
    +
    +  if (thread_cpu_usage_start_.has_value()) {
    +    ...
    +    closure_execution_time->Record(absl::ToInt64Microseconds(run_duration));
    +  }
    +
    +
    + +

    Benchmark results:

    +
    Run on (40 X 2793 MHz CPUs); 2020-03-24T20:08:19.991412535-07:00
    +CPU: Intel Ivybridge with HyperThreading (20 cores) dL1:32KB dL2:256KB dL3:25MB
    +Benchmark                                      Base (ns)    New (ns) Improvement
    +----------------------------------------------------------------------------
    +BM_PacketOverhead_mean                               224          85    +62.0%
    +
    + +
### Avoid logging on hot code paths

Logging statements can be costly even when the configured logging level means
the statement doesn't actually log anything. E.g., `ABSL_VLOG`'s implementation
requires at least a load and a comparison, which may be a problem in hot code
paths. In addition, the presence of the logging code may inhibit compiler
optimizations. Consider dropping logging entirely from hot code paths.
    +

    Remove logging from guts of memory allocator.

    +
    + +

    This was a small part of a larger change.

    +

    gpu_bfc_allocator.cc

    +
    void GPUBFCAllocator::SplitChunk(...) {
    +  ...
    +  VLOG(6) << "Adding to chunk map: " << new_chunk->ptr;
    +  ...
    +}
    +...
    +void GPUBFCAllocator::DeallocateRawInternal(void* ptr) {
    +  ...
    +  VLOG(6) << "Chunk at " << c->ptr << " no longer in use";
    +  ...
    +}
    +
    +
    void GPUBFCAllocator::SplitChunk(...) {
    +...
    +}
    +...
    +void GPUBFCAllocator::DeallocateRawInternal(void* ptr) {
    +...
    +}
    +
    + +
    + + + +
    +

Precompute whether or not logging is enabled outside a nested loop.

    +
    + +

    image_similarity.cc

    + +
    +
    for (int j = 0; j < output_subimage_size_y; j++) {
    +  int j1 = j - rad + output_to_integral_subimage_y;
    +  int j2 = j1 + 2 * rad + 1;
    +  // Create a pointer for this row's output, taking into account the offset
    +  // to the full image.
    +  double *image_diff_ptr = &(*image_diff)(j + min_j, min_i);
    +
    +  for (int i = 0; i < output_subimage_size_x; i++) {
    +    ...
    +    if (VLOG_IS_ON(3)) {
    +    ...
    +    }
    +    ...
    +  }
    +}
    +
    +
    + +
    +
    const bool vlog_3 = DEBUG_MODE ? VLOG_IS_ON(3) : false;
    +
    +for (int j = 0; j < output_subimage_size_y; j++) {
    +  int j1 = j - rad + output_to_integral_subimage_y;
    +  int j2 = j1 + 2 * rad + 1;
    +  // Create a pointer for this row's output, taking into account the offset
    +  // to the full image.
    +  double *image_diff_ptr = &(*image_diff)(j + min_j, min_i);
    +
    +  for (int i = 0; i < output_subimage_size_x; i++) {
    +    ...
    +    if (vlog_3) {
    +    ...
    +    }
    +  }
    +}
    +
    +
    + +
    Run on (40 X 2801 MHz CPUs); 2016-05-16T15:55:32.250633072-07:00
    +CPU: Intel Ivybridge with HyperThreading (20 cores) dL1:32KB dL2:256KB dL3:25MB
    +Benchmark                          Base (ns)  New (ns) Improvement
    +------------------------------------------------------------------
    +BM_NCCPerformance/16                   29104     26372     +9.4%
    +BM_NCCPerformance/64                  473235    425281    +10.1%
    +BM_NCCPerformance/512               30246238  27622009     +8.7%
    +BM_NCCPerformance/1k              125651445  113361991     +9.8%
    +BM_NCCLimitedBoundsPerformance/16       8314      7498     +9.8%
    +BM_NCCLimitedBoundsPerformance/64     143508    132202     +7.9%
    +BM_NCCLimitedBoundsPerformance/512   9335684   8477567     +9.2%
    +BM_NCCLimitedBoundsPerformance/1k   37223897  34201739     +8.1%
    +
    + +
    + + + +
    +

Precompute whether logging is enabled and use the result in helper routines.

    +
    + +

    periodic_call.cc

    + +
    +
      VLOG(1) << Logid()
    +          << "MaybeScheduleAlarmAtNextTick. Time until next real time: "
    +          << time_until_next_real_time;
    +          ...
    +  uint64 next_virtual_time_ms =
    +      next_virtual_time_ms_ - num_ticks * kResolutionMs;
    +  CHECK_GE(next_virtual_time_ms, 0);
    +  ScheduleAlarm(now, delay, next_virtual_time_ms);
    +}
    +
    +void ScheduleNextAlarm(uint64 current_virtual_time_ms)
    +    ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
    +  if (calls_.empty()) {
    +    VLOG(1) << Logid() << "No calls left, entering idle mode";
    +    next_real_time_ = absl::InfiniteFuture();
    +    return;
    +  }
    +  uint64 next_virtual_time_ms = FindNextVirtualTime(current_virtual_time_ms);
    +  auto delay =
    +      absl::Milliseconds(next_virtual_time_ms - current_virtual_time_ms);
    +  ScheduleAlarm(GetClock().TimeNow(), delay, next_virtual_time_ms);
    +}
    +
    +// An alarm scheduled by this function supersedes all previously scheduled
    +// alarms. This is ensured through `scheduling_sequence_number_`.
    +void ScheduleAlarm(absl::Time now, absl::Duration delay,
    +                   uint64 virtual_time_ms)
    +    ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
    +  next_real_time_ = now + delay;
    +  next_virtual_time_ms_ = virtual_time_ms;
    +  ++ref_count_;  // The Alarm holds a reference.
    +  ++scheduling_sequence_number_;
    +  VLOG(1) << Logid() << "ScheduleAlarm. Time : "
    +          << absl::FormatTime("%M:%S.%E3f", now, absl::UTCTimeZone())
    +          << ", delay: " << delay << ", virtual time: " << virtual_time_ms
    +          << ", refs: " << ref_count_
    +          << ", seq: " << scheduling_sequence_number_
    +          << ", executor: " << executor_;
    +
    +  executor_->AddAfter(
    +      delay, new Alarm(this, virtual_time_ms, scheduling_sequence_number_));
    +}
    +
    +
    + +
    +
      const bool vlog_1 = VLOG_IS_ON(1);
    +
    +  if (vlog_1) {
    +    VLOG(1) << Logid()
    +            << "MaybeScheduleAlarmAtNextTick. Time until next real time: "
    +            << time_until_next_real_time;
    +  }
    +  ...
    +  uint64 next_virtual_time_ms =
    +      next_virtual_time_ms_ - num_ticks * kResolutionMs;
    +  CHECK_GE(next_virtual_time_ms, 0);
    +  ScheduleAlarm(now, delay, next_virtual_time_ms, vlog_1);
    +}
    +
    +void ScheduleNextAlarm(uint64 current_virtual_time_ms, bool vlog_1)
    +    ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
    +  if (calls_.empty()) {
    +    if (vlog_1) {
    +      VLOG(1) << Logid() << "No calls left, entering idle mode";
    +    }
    +    next_real_time_ = absl::InfiniteFuture();
    +    return;
    +  }
    +  uint64 next_virtual_time_ms = FindNextVirtualTime(current_virtual_time_ms);
    +  auto delay =
    +      absl::Milliseconds(next_virtual_time_ms - current_virtual_time_ms);
    +  ScheduleAlarm(GetClock().TimeNow(), delay, next_virtual_time_ms, vlog_1);
    +}
    +
    +// An alarm scheduled by this function supersedes all previously scheduled
    +// alarms. This is ensured through `scheduling_sequence_number_`.
    +void ScheduleAlarm(absl::Time now, absl::Duration delay,
    +                   uint64 virtual_time_ms,
    +                   bool vlog_1)
    +    ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
    +  next_real_time_ = now + delay;
    +  next_virtual_time_ms_ = virtual_time_ms;
    +  ++ref_count_;  // The Alarm holds a reference.
    +  ++scheduling_sequence_number_;
    +  if (vlog_1) {
    +    VLOG(1) << Logid() << "ScheduleAlarm. Time : "
    +            << absl::FormatTime("%M:%S.%E3f", now, absl::UTCTimeZone())
    +            << ", delay: " << delay << ", virtual time: " << virtual_time_ms
    +            << ", refs: " << ref_count_
    +            << ", seq: " << scheduling_sequence_number_
    +            << ", executor: " << executor_;
    +  }
    +
    +  executor_->AddAfter(
    +      delay, new Alarm(this, virtual_time_ms, scheduling_sequence_number_));
    +}
    +
    +
    + + +
## Code size considerations {#code-size-considerations}

Performance encompasses more than just runtime speed. Sometimes it is worth
considering the effects of software choices on the size of the generated code.
Large code size means longer compile and link times, bloated binaries, more
memory usage, more icache pressure, and sometimes negative effects on other
microarchitectural structures such as branch predictors. Thinking about these
issues is especially important when writing low-level library code that will be
used in many places, or when writing templated code that you expect will be
instantiated for many different types.

The techniques that are useful for reducing code size vary significantly across
programming languages. Here are some techniques that we have found useful for
C++ code (which can suffer from an overuse of templates and inlining).

### Trim commonly inlined code

Widely called functions combined with inlining can have a dramatic effect on
code size.
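As a sketch of the pattern (this `Validator` is invented for illustration; the
TF_CHECK_OK change below follows the same shape): keep the inlined portion
tiny, and emit the bulky failure-handling code once, out of line, instead of at
every call site.

```c++
class Validator {
 public:
  void CheckOk(bool ok, const char* expr) {
    if (ok) return;        // Tiny inlined fast path: a test and a branch.
    DieWithMessage(expr);  // Cold path: a single out-of-line copy.
  }

 private:
  [[noreturn]] void DieWithMessage(const char* expr);  // In the .cc file.
};
```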
    +

    Speed up TF_CHECK_OK.

    +
    + +

Avoid creating an Ok object, and save code space by doing the complex
formatting of the fatal error message out of line instead of at every call
site.

    +

    status.h

    + +
    +
    #define TF_CHECK_OK(val) CHECK_EQ(::tensorflow::Status::OK(), (val))
    +#define TF_QCHECK_OK(val) QCHECK_EQ(::tensorflow::Status::OK(), (val))
    +
    +
    + +
    +
    extern tensorflow::string* TfCheckOpHelperOutOfLine(
    +    const ::tensorflow::Status& v, const char* msg);
    +inline tensorflow::string* TfCheckOpHelper(::tensorflow::Status v,
    +                                           const char* msg) {
    +  if (v.ok()) return nullptr;
    +  return TfCheckOpHelperOutOfLine(v, msg);
    +}
    +#define TF_CHECK_OK(val)                                           \
    +  while (tensorflow::string* _result = TfCheckOpHelper(val, #val)) \
    +  LOG(FATAL) << *(_result)
    +#define TF_QCHECK_OK(val)                                          \
    +  while (tensorflow::string* _result = TfCheckOpHelper(val, #val)) \
    +  LOG(QFATAL) << *(_result)
    +
    +
    + +

    status.cc

    + +
    +
    string* TfCheckOpHelperOutOfLine(const ::tensorflow::Status& v,
    +                                 const char* msg) {
    +  string r("Non-OK-status: ");
    +  r += msg;
    +  r += " status: ";
    +  r += v.ToString();
    +  // Leaks string but this is only to be used in a fatal error message
    +  return new string(r);
    +}
    +
    +
    + + +
    + + + +
    +

Shrink each RETURN_IF_ERROR call site by 79 bytes of code.

    +
    + +
1. Added a special adaptor class for use by just RETURN_IF_ERROR.
2. Do not construct/destruct StatusBuilder on the fast path of RETURN_IF_ERROR.
3. Do not inline some StatusBuilder methods since they are no longer needed on
   the fast path.
4. Avoid an unnecessary ~Status call.
    + +
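A minimal sketch of the adaptor-class idea, assuming hypothetical names
(`StatusAdaptorForMacros`, `RETURN_IF_ERROR_SKETCH`) rather than the actual
Abseil implementation:

```c++
#include <utility>

#include "absl/base/optimization.h"
#include "absl/status/status.h"

// Sketch only: the adaptor converts to bool cheaply on the fast path; the
// error object is only touched on the failure path.
class StatusAdaptorForMacros {
 public:
  explicit StatusAdaptorForMacros(absl::Status s) : status_(std::move(s)) {}
  // Fast path: a single well-predicted branch, no builder construction.
  explicit operator bool() const { return ABSL_PREDICT_TRUE(status_.ok()); }
  // Slow path: only reached when status_ holds an error.
  absl::Status&& Consume() { return std::move(status_); }

 private:
  absl::Status status_;
};

#define RETURN_IF_ERROR_SKETCH(expr)                          \
  if (auto status_adaptor = StatusAdaptorForMacros((expr))) { \
  } else /* NOLINT */                                         \
    return status_adaptor.Consume()
```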
    + + + +
    +

Improve performance of CHECK_GE by 4.5X and shrink code size from 125 bytes to
77 bytes.

    +
    + +

    logging.h

    + +
    +
    struct CheckOpString {
    +  CheckOpString(string* str) : str_(str) { }
    +  ~CheckOpString() { delete str_; }
    +  operator bool() const { return str_ == NULL; }
    +  string* str_;
    +};
    +...
    +#define DEFINE_CHECK_OP_IMPL(name, op) \
    +  template <class t1, class t2> \
    +  inline string* Check##name##Impl(const t1& v1, const t2& v2, \
    +                                   const char* names) { \
    +    if (v1 op v2) return NULL; \
    +    else return MakeCheckOpString(v1, v2, names); \
    +  } \
    +  string* Check##name##Impl(int v1, int v2, const char* names);
    +DEFINE_CHECK_OP_IMPL(EQ, ==)
    +DEFINE_CHECK_OP_IMPL(NE, !=)
    +DEFINE_CHECK_OP_IMPL(LE, <=)
    +DEFINE_CHECK_OP_IMPL(LT, < )
    +DEFINE_CHECK_OP_IMPL(GE, >=)
    +DEFINE_CHECK_OP_IMPL(GT, > )
    +#undef DEFINE_CHECK_OP_IMPL
    +
    +
    + +
    +
    struct CheckOpString {
    +  CheckOpString(string* str) : str_(str) { }
    +  // No destructor: if str_ is non-NULL, we're about to LOG(FATAL),
    +  // so there's no point in cleaning up str_.
    +  operator bool() const { return str_ == NULL; }
    +  string* str_;
    +};
    +...
    +extern string* MakeCheckOpStringIntInt(int v1, int v2, const char* names);
    +
    +template<int, int>
    +string* MakeCheckOpString(const int& v1, const int& v2, const char* names) {
    +  return MakeCheckOpStringIntInt(v1, v2, names);
    +}
    +...
    +#define DEFINE_CHECK_OP_IMPL(name, op) \
    +  template <class t1, class t2> \
    +  inline string* Check##name##Impl(const t1& v1, const t2& v2, \
    +                                   const char* names) { \
    +    if (v1 op v2) return NULL; \
    +    else return MakeCheckOpString(v1, v2, names); \
    +  } \
    +  inline string* Check##name##Impl(int v1, int v2, const char* names) { \
    +    if (v1 op v2) return NULL; \
    +    else return MakeCheckOpString(v1, v2, names); \
    +  }
    +DEFINE_CHECK_OP_IMPL(EQ, ==)
    +DEFINE_CHECK_OP_IMPL(NE, !=)
    +DEFINE_CHECK_OP_IMPL(LE, <=)
    +DEFINE_CHECK_OP_IMPL(LT, < )
    +DEFINE_CHECK_OP_IMPL(GE, >=)
    +DEFINE_CHECK_OP_IMPL(GT, > )
    +#undef DEFINE_CHECK_OP_IMPL
    +
    +
    + +

    logging.cc

    + +
    +
    string* MakeCheckOpStringIntInt(int v1, int v2, const char* names) {
    +  strstream ss;
    +  ss << names << " (" << v1 << " vs. " << v2 << ")";
    +  return new string(ss.str(), ss.pcount());
    +}
    +
    +
    + + +
### Inline with care

Inlining can often improve performance, but sometimes it can increase code size
without a corresponding performance payoff (and in some cases even cause a
performance loss due to increased instruction cache pressure).
    +

    Reduce inlining in TensorFlow.

    +
    + +

The change stops inlining many non-performance-sensitive functions (e.g., error
paths and op registration code). Furthermore, slow paths of some
performance-sensitive functions are moved into non-inlined functions.

    +

These changes reduce the size of tensorflow symbols in a typical binary by
12.2% (8814545 bytes down to 7740233 bytes).

    + +
    + + + +
    +

Protocol buffer library change. Avoid expensive inlined code space for encoding
message lengths for messages ≥ 128 bytes, and instead do a procedure call to a
shared out-of-line routine.

    +
    + +

This not only makes important large binaries smaller, but also faster.

    +

Bytes of generated code per line of a heavily inlined routine in one large
binary. The first number represents the total bytes generated for a particular
source line, including all locations where that code has been inlined.

    +

    Before:

    +
    .           0   1825 template <typename MessageType>
    +.           0   1826 inline uint8* WireFormatLite::InternalWriteMessage(
    +.           0   1827     int field_number, const MessageType& value, uint8* target,
    +.           0   1828     io::EpsCopyOutputStream* stream) {
    +>>>    389246   1829   target = WriteTagToArray(field_number, WIRETYPE_LENGTH_DELIMITED, target);
    +>>>   5454640   1830   target = io::CodedOutputStream::WriteVarint32ToArray(
    +>>>    337837   1831       static_cast<uint32>(value.GetCachedSize()), target);
    +>>>   1285539   1832   return value._InternalSerialize(target, stream);
    +.           0   1833 }
    +
    +

    The new codesize output with this change looks like:

    +
    .           0   1825 template <typename MessageType>
    +.           0   1826 inline uint8* WireFormatLite::InternalWriteMessage(
    +.           0   1827     int field_number, const MessageType& value, uint8* target,
    +.           0   1828     io::EpsCopyOutputStream* stream) {
    +>>>    450612   1829   target = WriteTagToArray(field_number, WIRETYPE_LENGTH_DELIMITED, target);
    +>>       9609   1830   target = io::CodedOutputStream::WriteVarint32ToArrayOutOfLine(
    +>>>    434668   1831       static_cast<uint32>(value.GetCachedSize()), target);
    +>>>   1597394   1832   return value._InternalSerialize(target, stream);
    +.           0   1833 }
    +
    +

    coded_stream.h

    + +
    +
    class PROTOBUF_EXPORT CodedOutputStream {
    +  ...
    +  // Like WriteVarint32()  but writing directly to the target array, and with the
    +  // less common-case paths being out of line rather than inlined.
    +  static uint8* WriteVarint32ToArrayOutOfLine(uint32 value, uint8* target);
    +  ...
    +};
    +...
    +inline uint8* CodedOutputStream::WriteVarint32ToArrayOutOfLine(uint32 value,
    +                                                               uint8* target) {
    +  target[0] = static_cast<uint8>(value);
    +  if (value < 0x80) {
    +    return target + 1;
    +  } else {
    +    return WriteVarint32ToArrayOutOfLineHelper(value, target);
    +  }
    +}
    +
    +
    + +

    coded_stream.cc

    + +
    +
    uint8* CodedOutputStream::WriteVarint32ToArrayOutOfLineHelper(uint32 value,
    +                                                              uint8* target) {
    +  DCHECK_GE(value, 0x80);
    +  target[0] |= static_cast<uint8>(0x80);
    +  value >>= 7;
    +  target[1] = static_cast<uint8>(value);
    +  if (value < 0x80) {
    +    return target + 2;
    +  }
    +  target += 2;
    +  do {
    +    // Turn on continuation bit in the byte we just wrote.
    +    target[-1] |= static_cast<uint8>(0x80);
    +    value >>= 7;
    +    *target = static_cast<uint8>(value);
    +    ++target;
    +  } while (value >= 0x80);
    +  return target;
    +}
    +
    +
    + + +
    + + + +
    +

Reduce absl::flat_hash_set and absl::flat_hash_map code size.

    +
    + +
1. Extract code that does not depend on the specific hash table type into
   common (non-inlined) functions.
2. Place ABSL_ATTRIBUTE_NOINLINE directives judiciously.
3. Out-of-line some slow paths.
    +

    Reduces sizes of some large binaries by ~0.5%.

    + +
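As a hedged illustration of the first two techniques (hypothetical `TinyVec`
and `GrowBackingArray`, not the actual raw_hash_set code): the type-independent
slow path lives in one shared, non-inlined function, so only a thin fast path
is stamped out per instantiation.

```c++
#include <cstddef>
#include <cstdlib>
#include <cstring>

#include "absl/base/attributes.h"

// Shared by every instantiation below: one copy of the bulky code in the
// binary instead of one copy per element type.
ABSL_ATTRIBUTE_NOINLINE void* GrowBackingArray(void* old_data, size_t old_bytes,
                                               size_t new_bytes) {
  void* mem = std::malloc(new_bytes);
  if (old_data != nullptr) {
    std::memcpy(mem, old_data, old_bytes);
    std::free(old_data);
  }
  return mem;
}

template <typename T>  // Assumes trivially copyable T to keep the sketch short.
class TinyVec {
 public:
  void push_back(const T& v) {
    if (size_ == capacity_) {  // Slow path: type-independent work is shared.
      size_t new_cap = capacity_ == 0 ? 4 : capacity_ * 2;
      data_ = static_cast<T*>(GrowBackingArray(data_, capacity_ * sizeof(T),
                                               new_cap * sizeof(T)));
      capacity_ = new_cap;
    }
    data_[size_++] = v;  // Fast path stays inline.
  }

 private:
  T* data_ = nullptr;
  size_t size_ = 0;
  size_t capacity_ = 0;
};
```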
    + + + +
    +

Do not inline string allocation and deallocation when not using protobuf
arenas.

    +
    + +

    public/arenastring.h

    + +
    +
      if (IsDefault(default_value)) {
    +    std::string* new_string = new std::string();
    +    tagged_ptr_.Set(new_string);
    +    return new_string;
    +  } else {
    +    return UnsafeMutablePointer();
    +  }
    +}
    +
    +
    + +
    +
      if (IsDefault(default_value)) {
    +    return SetAndReturnNewString();
    +  } else {
    +    return UnsafeMutablePointer();
    +  }
    +}
    +
    +
    + +

    internal/arenastring.cc

    + +
    +
    std::string* ArenaStringPtr::SetAndReturnNewString() {
    +  std::string* new_string = new std::string();
    +  tagged_ptr_.Set(new_string);
    +  return new_string;
    +}
    +
    +
    + + +
    + + + +
    +

Avoid inlining some routines. Create variants of routines that take
'const char*' rather than 'const std::string&' to avoid std::string
construction code at every call site.

    +
    + +

    op.h

    + +
    +
    class OpDefBuilderWrapper {
    + public:
    +  explicit OpDefBuilderWrapper(const char name[]) : builder_(name) {}
    +  OpDefBuilderWrapper& Attr(std::string spec) {
    +    builder_.Attr(std::move(spec));
    +    return *this;
    +  }
    +  OpDefBuilderWrapper& Input(std::string spec) {
    +    builder_.Input(std::move(spec));
    +    return *this;
    +  }
    +  OpDefBuilderWrapper& Output(std::string spec) {
    +    builder_.Output(std::move(spec));
    +    return *this;
    +  }
    +
    +
    + +
    +
    class OpDefBuilderWrapper {
    + public:
    +  explicit OpDefBuilderWrapper(const char name[]) : builder_(name) {}
    +  OpDefBuilderWrapper& Attr(std::string spec) {
    +    builder_.Attr(std::move(spec));
    +    return *this;
    +  }
    +  OpDefBuilderWrapper& Attr(const char* spec) TF_ATTRIBUTE_NOINLINE {
    +    return Attr(std::string(spec));
    +  }
    +  OpDefBuilderWrapper& Input(std::string spec) {
    +    builder_.Input(std::move(spec));
    +    return *this;
    +  }
    +  OpDefBuilderWrapper& Input(const char* spec) TF_ATTRIBUTE_NOINLINE {
    +    return Input(std::string(spec));
    +  }
    +  OpDefBuilderWrapper& Output(std::string spec) {
    +    builder_.Output(std::move(spec));
    +    return *this;
    +  }
    +  OpDefBuilderWrapper& Output(const char* spec) TF_ATTRIBUTE_NOINLINE {
    +    return Output(std::string(spec));
    +  }
    +
    +
    + + +
### Reduce template instantiations

Templated code can be duplicated for every possible combination of template
arguments when it is instantiated.
    +

    Replace template argument with a regular argument.

    +
    + +

Changed a large routine templated on a bool to instead take the bool as an
extra argument. (The bool was only being used once to select one of two string
constants, so a run-time check was just fine.) This reduced the number of
instantiations of the large routine from 287 to 143.

    +

    sharding_util_ops.cc

    + +
    +
    template <bool Split>
    +Status GetAndValidateAttributes(OpKernelConstruction* ctx,
    +                                std::vector<int32>& num_partitions,
    +                                int& num_slices, std::vector<int32>& paddings,
    +                                bool& has_paddings) {
    +  absl::string_view num_partitions_attr_name =
    +      Split ? kNumSplitsAttrName : kNumConcatsAttrName;
    +      ...
    +  return OkStatus();
    +}
    +
    +
    + +
    +
    Status GetAndValidateAttributes(bool split, OpKernelConstruction* ctx,
    +                                std::vector<int32>& num_partitions,
    +                                int& num_slices, std::vector<int32>& paddings,
    +                                bool& has_paddings) {
    +  absl::string_view num_partitions_attr_name =
    +      split ? kNumSplitsAttrName : kNumConcatsAttrName;
    +      ...
    +  return OkStatus();
    +}
    +
    +
    + + +
    + + + +
    +

Move bulky code from a templated constructor to a non-templated shared base
class constructor.

    +
    + +

Also reduce the number of template instantiations from one for every
combination of <T, Device, Rank> to one for every <T> and every <Rank>.

    +

    sharding_util_ops.cc

    + +
    +
    template <typename Device, typename T>
    +class XlaSplitNDBaseOp : public OpKernel {
    + public:
    +  explicit XlaSplitNDBaseOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
    +    OP_REQUIRES_OK(
    +        ctx, GetAndValidateAttributes(/*split=*/true, ctx, num_splits_,
    +                                      num_slices_, paddings_, has_paddings_));
    +  }
    +
    +
    + +
    +
    // Shared base class to save code space
    +class XlaSplitNDShared : public OpKernel {
    + public:
    +  explicit XlaSplitNDShared(OpKernelConstruction* ctx) TF_ATTRIBUTE_NOINLINE
    +      : OpKernel(ctx),
    +        num_slices_(1),
    +        has_paddings_(false) {
    +    GetAndValidateAttributes(/*split=*/true, ctx, num_splits_, num_slices_,
    +                             paddings_, has_paddings_);
    +  }
    +
    +
    + + +
    + + + +
    +

Reduce generated code size for absl::flat_hash_set and absl::flat_hash_map.

    +
    + +
*   Extract code that does not depend on the specific hash table type into
    common (non-inlined) functions.
*   Place ABSL_ATTRIBUTE_NOINLINE directives judiciously.
*   Move some slow paths out of line.
    + +
### Reduce container operations

Consider the impact of map and other container operations, since each such
operation can produce large amounts of generated code.
    +

Turn many map insertion calls in a row that initialize a hash table of emoji
characters into a single bulk insert operation (188KB of text down to 360 bytes
in a library linked into many binaries). 😊

    +
    + +

    textfallback_init.h

    + +
    +
    inline void AddEmojiFallbacks(TextFallbackMap *map) {
    +  (*map)[0xFE000] = &kFE000;
    +  (*map)[0xFE001] = &kFE001;
    +  (*map)[0xFE002] = &kFE002;
    +  (*map)[0xFE003] = &kFE003;
    +  (*map)[0xFE004] = &kFE004;
    +  (*map)[0xFE005] = &kFE005;
    +  ...
    +  (*map)[0xFEE7D] = &kFEE7D;
    +  (*map)[0xFEEA0] = &kFEEA0;
    +  (*map)[0xFE331] = &kFE331;
    +};
    +
    +
    + +
    +
    inline void AddEmojiFallbacks(TextFallbackMap *map) {
    +#define PAIR(x) {0x##x, &k##x}
    +  // clang-format off
    +  map->insert({
    +    PAIR(FE000),
    +    PAIR(FE001),
    +    PAIR(FE002),
    +    PAIR(FE003),
    +    PAIR(FE004),
    +    PAIR(FE005),
    +    ...
    +    PAIR(FEE7D),
    +    PAIR(FEEA0),
    +    PAIR(FE331)});
    +  // clang-format on
    +#undef PAIR
    +};
    +
    +
    + + +
    + + + +
    +

    Stop inlining a heavy user of InlinedVector operations.

    +
    + +

Moved a very long routine that was being inlined from the .h file to the .cc
file (no real performance benefit from inlining this).

    +

    reduction_ops_common.h

    +
    Status Simplify(const Tensor& data, const Tensor& axis,
    +                const bool keep_dims) {
    +  ... Eighty line routine body ...
    +}
    +
    +
    Status Simplify(const Tensor& data, const Tensor& axis, const bool keep_dims);
    +
    + +
## Parallelization and synchronization {#parallelization-and-synchronization}

### Exploit parallelism {#exploit-parallelism}

Modern machines have many cores, and they are often underutilized. Expensive
work may therefore be completed faster by parallelizing it. The most common
approach is to process different items in parallel and combine the results when
done. Typically, the items are first partitioned into batches to avoid paying
the cost of running something in parallel per item.
    +

Four-way parallelization improves the rate of encoding tokens by ~3.6x.

    +
    + +

    blocked-token-coder.cc

    + +
    +
    MutexLock l(&encoder_threads_lock);
    +if (encoder_threads == NULL) {
    +  encoder_threads = new ThreadPool(NumCPUs());
    +  encoder_threads->SetStackSize(262144);
    +  encoder_threads->StartWorkers();
    +}
    +encoder_threads->Add
    +    (NewCallback(this,
    +                 &BlockedTokenEncoder::EncodeRegionInThread,
    +                 region_tokens, N, region,
    +                 stats,
    +                 controller_->GetClosureWithCost
    +                 (NewCallback(&DummyCallback), N)));
    +
    +
    + + +
    + + + +
    +

    Parallelization improves decoding performance by 5x.

    +
    + +

    coding.cc

    +
    for (int c = 0; c < clusters->size(); c++) {
+  RET_CHECK_OK(DecodeBulkForCluster(...));
    +}
    +
    +
    struct SubTask {
    +  absl::Status result;
    +  absl::Notification done;
    +};
    +
    +std::vector<SubTask> tasks(clusters->size());
    +for (int c = 0; c < clusters->size(); c++) {
    +  options_.executor->Schedule([&, c] {
    +    tasks[c].result = DecodeBulkForCluster(...);
    +    tasks[c].done.Notify();
    +  });
    +}
    +for (int c = 0; c < clusters->size(); c++) {
    +  tasks[c].done.WaitForNotification();
    +}
    +for (int c = 0; c < clusters->size(); c++) {
    +  RETURN_IF_ERROR(tasks[c].result);
    +}
    +
    + +
The effect on system performance should be measured carefully – if spare CPU is
not available, or if memory bandwidth is saturated, parallelization may not
help, or may even hurt.

### Amortize lock acquisition {#amortize-lock-acquisition}

Avoid fine-grained locking to reduce the cost of Mutex operations in hot paths.
Caveat: this should only be done if the change does not increase lock
contention.
    +

Acquire the lock once to free an entire tree of query nodes, rather than
reacquiring the lock for every node in the tree.

    +
    + +

    mustang-query.cc

    + +
    +
    // Pool of query nodes
    +ThreadSafeFreeList<MustangQuery> pool_(256);
    +...
    +void MustangQuery::Release(MustangQuery* node) {
    +  if (node == NULL)
    +    return;
    +  for (int i=0; i < node->children_->size(); ++i)
    +    Release((*node->children_)[i]);
    +  node->children_->clear();
    +  pool_.Delete(node);
    +}
    +
    +
    + +
    +
    // Pool of query nodes
    +Mutex pool_lock_;
    +FreeList<MustangQuery> pool_(256);
    +...
    +void MustangQuery::Release(MustangQuery* node) {
    +  if (node == NULL)
    +    return;
    +  MutexLock l(&pool_lock_);
    +  ReleaseLocked(node);
    +}
    +
    +void MustangQuery::ReleaseLocked(MustangQuery* node) {
    +#ifndef NDEBUG
    +  pool_lock_.AssertHeld();
    +#endif
    +  if (node == NULL)
    +    return;
    +  for (int i=0; i < node->children_->size(); ++i)
    +    ReleaseLocked((*node->children_)[i]);
    +  node->children_->clear();
    +  pool_.Delete(node);
    +}
    +
    +
    + + +
### Keep critical sections short {#keep-critical-sections-short}

Avoid expensive work inside critical sections. In particular, watch out for
innocuous-looking code that might be doing RPCs or accessing files.
    +

    Reduce number of cache lines touched in critical section.

    +
    + +

Careful data structure adjustments significantly reduce the number of cache
lines accessed and improve the performance of an ML training run by 3.3%.

    +
1. Precompute some per-node type properties as bits within the NodeItem data
   structure, meaning that we can avoid touching the Node* object for outgoing
   edges in the critical section.
2. Change ExecutorState::ActivateNodes to use the NodeItem of the destination
   node for each outgoing edge, rather than touching fields in the *item->node
   object. Typically this means that we touch 1 or 2 cache lines total for
   accessing the needed edge data, rather than ~2 + O(num outgoing edges)
   (and for large graphs with many cores executing them there is also less TLB
   pressure).
    + +
    + + + +
    +

    Avoid RPC while holding Mutex.

    +
    + +

    trainer.cc

    + +
    +
    {
    +  // Notify the parameter server that we are starting.
    +  MutexLock l(&lock_);
    +  model_ = model;
    +  MaybeRecordProgress(last_global_step_);
    +}
    +
    +
    + +
    +
    bool should_start_record_progress = false;
    +int64 step_for_progress = -1;
    +{
    +  // Notify the parameter server that we are starting.
    +  MutexLock l(&lock_);
    +  model_ = model;
    +  should_start_record_progress = ShouldStartRecordProgress();
    +  step_for_progress = last_global_step_;
    +}
    +if (should_start_record_progress) {
    +  StartRecordProgress(step_for_progress);
    +}
    +
    +
    + + +
Also, be wary of expensive destructors that will run before a Mutex is unlocked
(this can often happen when the Mutex unlock is triggered by a `~MutexLock`).
Declaring objects with expensive destructors before the MutexLock may help
(assuming it is thread-safe).

### Reduce contention by sharding {#reduce-contention-by-sharding}

Sometimes a data structure protected by a Mutex that is exhibiting high
contention can be safely split into multiple shards, each shard with its own
Mutex. (Note: this requires that there are no cross-shard invariants between
the different shards.)
    +

Shard a cache 16 ways, which improves throughput under a multi-threaded load by
~2x.

    +
    + +

    cache.cc

    + +
    +
    class ShardedLRUCache : public Cache {
    + private:
    +  LRUCache shard_[kNumShards];
    +  port::Mutex id_mutex_;
    +  uint64_t last_id_;
    +
    +  static inline uint32_t HashSlice(const Slice& s) {
    +    return Hash(s.data(), s.size(), 0);
    +  }
    +
    +  static uint32_t Shard(uint32_t hash) {
    +    return hash >> (32 - kNumShardBits);
    +  }
    +  ...
    +  virtual Handle* Lookup(const Slice& key) {
    +    const uint32_t hash = HashSlice(key);
    +    return shard_[Shard(hash)].Lookup(key, hash);
    +  }
    +
    +
    + + +
    + + + +
    +

Shard Spanner data structure for tracking calls.

    +
    + +

    transaction_manager.cc

    + +
    +
    absl::MutexLock l(&active_calls_in_mu_);
    +ActiveCallMap::const_iterator iter = active_calls_in_.find(m->tid());
    +if (iter != active_calls_in_.end()) {
    +  iter->second.ExtractElements(&m->tmp_calls_);
    +}
    +
    +
    + +
    +
    ActiveCalls::LockedShard shard(active_calls_in_, m->tid());
    +const ActiveCallMap& active_calls_map = shard.active_calls_map();
    +ActiveCallMap::const_iterator iter = active_calls_map.find(m->tid());
    +if (iter != active_calls_map.end()) {
    +  iter->second.ExtractElements(&m->tmp_calls_);
    +}
    +
    +
    + + +
If the data structure in question is a map, consider using a concurrent hash
map implementation instead.

Be careful with the information used for shard selection. If, for example, you
use some bits of a hash value for shard selection and then those same bits end
up being used again later, the latter use may perform poorly since it sees a
skewed distribution of hash values.
    +

Fix the information used for shard selection to prevent hash table issues.

    +
    + +

    netmon_map_impl.h

    + +
    +
    ConnectionBucket* GetBucket(Index index) {
    +  // Rehash the hash to make sure we are not partitioning the buckets based on
    +  // the original hash. If num_buckets_ is a power of 2 that would drop the
    +  // entropy of the buckets.
    +  size_t original_hash = absl::Hash<Index>()(index);
    +  int hash = absl::Hash<size_t>()(original_hash) % num_buckets_;
    +  return &buckets_[hash];
    +}
    +
    +
    + +
    +
    ConnectionBucket* GetBucket(Index index) {
    +  absl::Hash<std::pair<Index, size_t>> hasher{};
    +  // Combine the hash with 42 to prevent shard selection using the same bits
    +  // as the underlying hashtable.
    +  return &buckets_[hasher({index, 42}) % num_buckets_];
    +}
    +
    +
    + + +
    + + + +
    +

    Shard Spanner data structure used for tracking calls.

    +
    + +

This CL partitions the ActiveCallMap into 64 shards. Each shard is protected by
a separate mutex. A given transaction will be mapped to exactly one shard. A
new interface LockedShard(tid) is added for accessing the ActiveCallMap for a
transaction in a thread-safe manner. Example usage:

    +

    transaction_manager.cc

    + +
    +
    {
    +  absl::MutexLock l(&active_calls_in_mu_);
    +  delayed_locks_timer_ring_.Add(delayed_locks_flush_time_ms, tid);
    +}
    +
    +
    + +
    +
    {
    +  ActiveCalls::LockedShard shard(active_calls_in_, tid);
    +  shard.delayed_locks_timer_ring().Add(delayed_locks_flush_time_ms, tid);
    +}
    +
    +
    + +

The results show a 69% reduction in overall wall-clock time when running the
benchmark with 8192 fibers:

    +
    Benchmark                   Time(ns)        CPU(ns)     Iterations
    +------------------------------------------------------------------
    +BM_ActiveCalls/8k        11854633492     98766564676            10
    +BM_ActiveCalls/16k       26356203552    217325836709            10
    +
    +
    Benchmark                   Time(ns)        CPU(ns)     Iterations
    +------------------------------------------------------------------
    +BM_ActiveCalls/8k         3696794642     39670670110            10
    +BM_ActiveCalls/16k        7366284437     79435705713            10
    +
    + +
### SIMD Instructions {#simd-instructions}

Explore whether handling multiple items at once using
[SIMD](https://en.wikipedia.org/wiki/Single_instruction,_multiple_data)
instructions available on modern CPUs can give speedups (e.g., see the
`absl::flat_hash_map` discussion below in the [Bulk Operations](#bulk-operations)
section).

### Reduce false sharing {#reduce-false-sharing}

If different threads access different mutable data, consider placing the
different data items on different cache lines, e.g., in C++ using the `alignas`
directive. However, these directives are easy to misuse and may increase object
sizes significantly, so make sure performance measurements justify their use.
    +

Segregate commonly mutated fields into a different cache line from other
fields.

    +
    + +

    histogram.h

    + +
    +
    HistogramOptions options_;
    +...
    +internal::HistogramBoundaries *boundaries_;
    +...
    +std::vector<double> buckets_;
    +
    +double min_;             // Minimum.
    +double max_;             // Maximum.
    +double count_;           // Total count of occurrences.
    +double sum_;             // Sum of values.
    +double sum_of_squares_;  // Sum of squares of values.
    +...
    +RegisterVariableExporter *exporter_;
    +
    +
    + +
    +
      HistogramOptions options_;
    +  ...
    +  internal::HistogramBoundaries *boundaries_;
    +  ...
    +  RegisterVariableExporter *exporter_;
    +  ...
    +  // Place the following fields in a dedicated cacheline as they are frequently
    +  // mutated, so we can avoid potential false sharing.
    +  ...
    +#ifndef SWIG
    +  alignas(ABSL_CACHELINE_SIZE)
    +#endif
    +  std::vector<double> buckets_;
    +
    +  double min_;             // Minimum.
    +  double max_;             // Maximum.
    +  double count_;           // Total count of occurrences.
    +  double sum_;             // Sum of values.
    +  double sum_of_squares_;  // Sum of squares of values.
    +
    +
    + + +
### Reduce frequency of context switches
    +

Process small work items inline instead of on a device thread pool.

    +
    + +

    cast_op.cc

    + +
    +
    template <typename Device, typename Tout, typename Tin>
    +void CastMaybeInline(const Device& d, typename TTypes<Tout>::Flat o,
    +                     typename TTypes<Tin>::ConstFlat i) {
    +  if (o.size() * (sizeof(Tin) + sizeof(Tout)) < 16384) {
    +    // Small cast on a CPU: do inline
    +    o = i.template cast<Tout>();
    +  } else {
    +    o.device(d) = i.template cast<Tout>();
    +  }
    +}
    +
    +
    + + +
### Use buffered channels for pipelining {#use-buffered-channels-for-pipelining}

Channels can be unbuffered, which means that a writer blocks until a reader is
ready to pick up an item. Unbuffered channels can be useful when the channel is
being used for synchronization, but not when the channel is being used to
increase parallelism.

### Consider lock-free approaches

Sometimes lock-free data structures can make a difference over more
conventional mutex-protected data structures. However, direct atomic variable
manipulation can be [dangerous][atomic danger]. Prefer higher-level
abstractions.
    +

Use a lock-free map to manage a cache of RPC channels.

    +
    + +

Entries in an RPC stub cache are read thousands of times a second and modified
rarely. Switching to an appropriate lock-free map reduces search latency by
3%-5%.

    + +
    + + + +
    +

Use a fixed lexicon plus a lock-free hash map to speed up determining
IsValidTokenId.

    +
    + +

    dynamic_token_class_manager.h

    + +
    +
    mutable Mutex mutex_;
    +
    +// The density of this hash map is guaranteed by the fact that the
    +// dynamic lexicon reuses previously allocated TokenIds before trying
    +// to allocate new ones.
    +dense_hash_map<TokenId, common::LocalTokenClassId> tid_to_cid_
    +    GUARDED_BY(mutex_);
    +
    +
    + +
    +
    // Read accesses to this hash-map should be done using
    +// 'epoch_gc_'::(EnterFast / LeaveFast). The writers should periodically
    +// GC the deleted entries, by simply invoking LockFreeHashMap::CreateGC.
    +typedef util::gtl::LockFreeHashMap<TokenId, common::LocalTokenClassId>
    +    TokenIdTokenClassIdMap;
    +TokenIdTokenClassIdMap tid_to_cid_;
    +
    +
    + + +
## Protocol Buffer advice {#protobuf-advice}

Protobufs are a convenient representation of data, especially if the data will
be sent over the wire or stored persistently. However, they can have
significant performance costs. For example, a piece of code that fills in a
list of 1000 points and then sums up the Y coordinates speeds up by a
**factor of 20** when converted from protobufs to a C++ std::vector of structs!
    +

    Benchmark code for both versions.

    +
    + +
    name                old time/op  new time/op  delta
    +BenchmarkIteration  17.4µs ± 5%   0.8µs ± 1%  -95.30%  (p=0.000 n=11+12)
    +
    +

    Protobuf version:

    +
    message PointProto {
    +  int32 x = 1;
    +  int32 y = 2;
    +}
    +message PointListProto {
    +  repeated PointProto points = 1;
    +}
    +
    +
    void SumProto(const PointListProto& vec) {
    +  int sum = 0;
    +  for (const PointProto& p : vec.points()) {
    +    sum += p.y();
    +  }
    +  ABSL_VLOG(1) << sum;
    +}
    +
    +void BenchmarkIteration() {
    +  PointListProto points;
    +  points.mutable_points()->Reserve(1000);
    +  for (int i = 0; i < 1000; i++) {
    +    PointProto* p = points.add_points();
    +    p->set_x(i);
    +    p->set_y(i * 2);
    +  }
    +  SumProto(points);
    +}
    +
    +

    Non-protobuf version:

    +
    struct PointStruct {
    +  int x;
    +  int y;
    +};
    +
    +void SumVector(const std::vector<PointStruct>& vec) {
    +  int sum = 0;
    +  for (const PointStruct& p : vec) {
    +    sum += p.y;
    +  }
    +  ABSL_VLOG(1) << sum;
    +}
    +
    +void BenchmarkIteration() {
    +  std::vector<PointStruct> points;
    +  points.reserve(1000);
    +  for (int i = 0; i < 1000; i++) {
    +    points.push_back({i, i * 2});
    +  }
    +  SumVector(points);
    +}
    +
    + +
In addition, the protobuf version adds a few kilobytes of code and data to the
binary, which may not seem like much, but adds up quickly in systems with many
protobuf types. This increased size creates performance problems by creating
i-cache and d-cache pressure.

Here are some tips related to protobuf performance:
    +

    Do not use protobufs unnecessarily.

    +
    + +

Given the factor of 20 performance difference described above, if some data is
never serialized or parsed, you probably should not put it in a protocol
buffer. The purpose of protocol buffers is to make it easy to serialize and
deserialize data structures, but they can have significant code-size, memory,
and CPU overheads. Do not use them if all you want are some of the other
niceties like DebugString and copyability.

    + +
    + + + +
    +

    Avoid unnecessary message hierarchies.

    +
    + +

Message hierarchy can be useful to organize information in a more readable
fashion. However, each extra level of message hierarchy incurs overheads like
memory allocations, function calls, cache misses, larger serialized messages,
etc.

    +

    E.g., instead of:

    +
    message Foo {
    +  optional Bar bar = 1;
    +}
    +message Bar {
    +  optional Baz baz = 1;
    +}
    +message Baz {
    +  optional int32 count = 1;
    +}
    +
    +

    Prefer:

    +
    message Foo {
    +  optional int32 count = 1;
    +}
    +
    +

A protocol buffer message corresponds to a message class in C++ generated code
and emits a tag and the length of the payload on the wire. To carry an integer,
the old form requires more allocations (and deallocations) and emits a larger
amount of generated code. As a result, all protocol buffer operations (parsing,
serialization, size, etc.) become more expensive, having to traverse the
message tree. The new form does not have such overhead and is more efficient.

    + +
    + + + +
    +

    Use small field numbers for frequently occurring fields.

    +
    + +

Protobufs use a variable length integer representation for the combination of
field number and wire format (see the protobuf encoding documentation). This
representation is 1 byte for field numbers between 1 and 15, and two bytes for
field numbers between 16 and 2047. (Field numbers 2048 or greater should
typically be avoided.)

    +

Consider pre-reserving some small field numbers for future extension of
performance-sensitive protobufs.

    + +
    + + + +
    +

Choose carefully between int32, sint32, fixed32, and uint32 (and similarly for
the 64-bit variants).

    +
    + +

Generally, use int32 or int64, but use fixed32 or fixed64 for large values like
hash codes, and sint32 or sint64 for values that are often negative.

    +

A varint occupies fewer bytes to encode small integers and can save space at
the cost of more expensive decoding. However, it can take up more space for
negative or large values. In that case, using fixed32 or fixed64 (instead of
uint32 or uint64) reduces size with much cheaper encoding and decoding. For
small negative integers, use sint32 or sint64 instead of int32 or int64.

    + +
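To make the size tradeoffs concrete, here is a small self-contained sketch
(our own helpers mirroring the wire-format rules, not protobuf library calls)
that computes encoded sizes for a few representative values:

```c++
#include <cstdint>
#include <cstdio>

// Number of bytes a value occupies as a base-128 varint.
int VarintSize(uint64_t v) {
  int n = 1;
  while (v >= 0x80) {
    v >>= 7;
    ++n;
  }
  return n;
}

// ZigZag maps small negative numbers to small unsigned numbers (sint32/64).
uint64_t ZigZag(int64_t v) {
  return (static_cast<uint64_t>(v) << 1) ^ static_cast<uint64_t>(v >> 63);
}

int main() {
  int64_t minus_one = -1;
  // int64 -1: all bits set, so the varint needs the full 10 bytes.
  std::printf("int64 -1:  %d bytes\n",
              VarintSize(static_cast<uint64_t>(minus_one)));     // 10
  // sint64 -1: zigzag(-1) == 1, a 1-byte varint.
  std::printf("sint64 -1: %d bytes\n", VarintSize(ZigZag(-1)));  // 1
  // A large hash-like value: a 9-byte varint vs. a flat 8-byte fixed64.
  std::printf("int64 2^60: %d bytes (fixed64 is always 8)\n",
              VarintSize(uint64_t{1} << 60));                    // 9
  return 0;
}
```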
    + + + +
    +

For proto2, pack repeated numeric fields by annotating them with
[packed=true].

    +
    + +

In proto2, repeated values are serialized as a sequence of (tag, value) pairs
by default. This is inefficient because tags have to be decoded for every
element.

    +

Packed repeated primitives are serialized with the length of the payload first,
followed by values without tags. When using fixed-width values, we can avoid
reallocations by knowing the final size the moment we start parsing; i.e., no
reallocation cost. We still don't know how many varints are in the payload and
may have to pay the reallocation cost.

    +

    In proto3, repeated fields are packed by default.

    +

Packed works best with fixed-width values like fixed32, fixed64, float, double,
etc., since the entire encoded length can be predetermined by multiplying the
number of elements by the fixed value size, instead of having to calculate the
length of each individual element.

    + +
    + + + +
    +

Use bytes instead of string for binary data and large values.

    +
    + +

The string type holds UTF8-encoded text, and can sometimes require validation.
The bytes type can hold an arbitrary sequence of bytes (non-text data) and is
often more appropriate as well as more efficient than string.

    + +
    + + + +
    +

    Consider string_type = VIEW to avoid copying.

    +
    + +

Copying a big string or bytes field during parsing is expensive. Such cost can
often be avoided by marking the field with string_type = VIEW.

    +
    message Image {
    +  ...
    +  bytes jpeg_encoding = 4 [features.(pb.cpp).string_type=VIEW];
    +}
    +
    +

Without the VIEW annotation, when the protocol buffer is parsed, the
potentially large field contents are copied from the serialized protocol buffer
to a string object in memory. Depending on the number of string or bytes fields
and the size of those fields, the overhead of copying can be significant.

    +

Instead of copying the big binary blobs, routines like
ParseFromStringWithAliasing use absl::string_view to reference the original
backing string. Note that the backing string (the serialized protocol buffer)
must outlive the protocol buffer instance that contains the alias.

    + +
    + + + + + +
    +

Consider using Cord for large fields to reduce copying costs.

    +
    + +

Annotating large bytes and string fields with [ctype=CORD] may reduce copying
costs. This annotation changes the representation of the field from std::string
to absl::Cord. absl::Cord uses reference counting and tree-based storage to
reduce copying and appending costs. If a protocol buffer is serialized to a
cord, parsing a string or bytes field with [ctype=CORD] can avoid copying the
field contents.

    +
    message Document {
    +  ...
    +  bytes html = 4 [ctype = CORD];
    +}
    +
    +

Performance of a Cord field depends on length distribution and access patterns.
Use benchmarks to validate such changes.

    + +
    + + + +
    +

    Use protobuf arenas in C++ code.

    +
    + +

Consider using arenas to save allocation and deallocation costs, especially for
protobufs containing repeated, string, or message fields.

    +

Message and string fields are heap-allocated (even if the top-level protocol
buffer object is stack-allocated). If a protocol buffer message has a lot of
sub-message fields and string fields, allocation and deallocation costs can be
significant. Arenas amortize allocation costs and make deallocation virtually
free. They also improve memory locality by allocating from contiguous chunks of
memory.

    + +
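A minimal usage sketch, assuming a hypothetical message type `MyMessage` and
hypothetical `Fill`/`Use` helpers:

```c++
#include "google/protobuf/arena.h"

void ProcessOneRequest() {
  google::protobuf::Arena arena;
  // Allocated on the arena; sub-messages and strings added to *msg are too.
  MyMessage* msg = google::protobuf::Arena::CreateMessage<MyMessage>(&arena);
  Fill(msg);
  Use(*msg);
}  // No per-object deallocations: the arena releases everything at once here.
```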
    + + + +
    +

Keep .proto files small.

    +
    + +

Do not put too many messages in a single .proto file. Once you rely on anything
at all from a .proto file, the entire file will get pulled in by the linker
even if it's mostly unused. This increases build times and binary sizes. You
can use extensions and Any to avoid creating hard dependencies on big .proto
files with many message types.

    + +
    + + + +
    +

    Consider storing protocol buffers in serialized form, even in memory.

    +
    + +

In-memory protobuf objects have a large memory footprint (often 5x the wire
format size), potentially spread across many cache lines. So if your
application is going to keep many protobuf objects live for long periods of
time, consider storing them in serialized form.

    + +
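A hedged sketch of this pattern, with a hypothetical `MyRecord` message type:
keep long-lived entries as serialized bytes and only materialize a message when
an entry is actually needed.

```c++
#include <cstddef>
#include <string>
#include <vector>

class RecordCache {
 public:
  void Add(const MyRecord& r) { serialized_.push_back(r.SerializeAsString()); }

  // Parse on demand: the cache's resident footprint stays close to the wire
  // size instead of the (often ~5x larger) in-memory message representation.
  bool Get(size_t i, MyRecord* out) const {
    return out->ParseFromString(serialized_[i]);
  }

 private:
  std::vector<std::string> serialized_;
};
```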
    + + + +
    +

    Avoid protobuf map fields.

    +
    + +

Protobuf map fields have performance problems that usually outweigh the small
syntactic convenience they provide. Prefer using non-protobuf maps initialized
from protobuf contents:

    +

    msg.proto

    +
    map<string, bytes> env_variables = 5;
    +
    +
    message Var {
    +  string key = 1;
    +  bytes value = 2;
    +}
    +repeated Var env_variables = 5;
    +
    + +
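For example (a sketch, assuming a parsed message `msg` containing the repeated
Var field above), build an ordinary hash map once and do all lookups against
it:

```c++
absl::flat_hash_map<std::string, std::string> env;
env.reserve(msg.env_variables_size());
for (const Var& v : msg.env_variables()) {
  env.emplace(v.key(), v.value());  // bytes fields map to std::string in C++.
}
```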
    + + + +
    +

Use a protobuf message definition with a subset of the fields.

    +
    + +

If you want to access only a few fields of a large message type, consider
defining your own protocol buffer message type that mimics the original type,
but only defines the fields that you care about. Here's an example:

    +
    message FullMessage {
    +  optional int32 field1 = 1;
    +  optional BigMessage field2 = 2;
    +  optional int32 field3 = 3;
+  repeated AnotherBigMessage field4 = 4;
    +  ...
    +  optional int32 field100 = 100;
    +}
    +
    +
    message SubsetMessage {
    +  optional int32 field3 = 3;
    +  optional int32 field88 = 88;
    +}
    +
    +

By parsing a serialized FullMessage into a SubsetMessage, only two out of a
hundred fields are parsed and others are treated as unknown fields. Consider
using APIs that discard unknown fields to improve performance even more when
appropriate.

    + +
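A usage sketch (assuming `serialized` holds bytes produced by serializing a
FullMessage, and a hypothetical `Process` helper):

```c++
SubsetMessage subset;
if (subset.ParseFromString(serialized)) {
  // Only field3 and field88 were decoded; the other 98 fields were skipped
  // into the unknown-field set (or dropped, if unknown fields are discarded).
  Process(subset.field3(), subset.field88());
}
```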
    + + + + + +
    +

    Reuse protobuf objects when possible.

    +
    + +

Declare protobuf objects outside loops so that their allocated storage can be
reused across loop iterations.

    + +
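A minimal sketch, assuming a hypothetical `MyRecord` message and `Process`
helper:

```c++
MyRecord record;  // Declared once, outside the loop.
for (const std::string& bytes : inputs) {
  // ParseFromString clears the message first, so strings and repeated fields
  // reuse the capacity they acquired in earlier iterations.
  if (record.ParseFromString(bytes)) {
    Process(record);
  }
}
```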
## C++-Specific advice

### absl::flat_hash_map (and set)

[Absl hash tables](https://abseil.io/docs/cpp/guides/container) usually
out-perform C++ standard library containers such as `std::map` and
`std::unordered_map`.
    +

Speed up LanguageFromCode (use absl::flat_hash_map instead of a
__gnu_cxx::hash_map).

    +
    + +

    languages.cc

    + +
    +
    class CodeToLanguage
    +    ...
    +    : public __gnu_cxx::hash_map<absl::string_view, i18n::languages::Language,
    +                                 CodeHash, CodeCompare> {
    +
    +
    + +
    +
    class CodeToLanguage
    +    ...
    +    : public absl::flat_hash_map<absl::string_view, i18n::languages::Language,
    +                                 CodeHash, CodeCompare> {
    +
    +
    + +

    Benchmark results:

    +
    name               old time/op  new time/op  delta
    +BM_CodeToLanguage  19.4ns ± 1%  10.2ns ± 3%  -47.47%  (p=0.000 n=8+10)
    +
    + +
    + + + +
    +

Speed up stats publish/unpublish (an older change, so it uses dense_hash_map
instead of absl::flat_hash_map, which did not exist at the time).

    +
    + +

    publish.cc

    + +
    +
    typedef hash_map<uint64, Publication*> PublicationMap;
    +static PublicationMap* publications = NULL;
    +
    +
    + +
    +
typedef dense_hash_map<uint64, Publication*> PublicationMap;
    +static PublicationMap* publications GUARDED_BY(mu) = NULL;
    +
    +
    + + +
    + + + +
    +

Use dense_hash_map instead of hash_map for keeping track of SelectServer alarms
(would use absl::flat_hash_map today).

    +
    + +

    alarmer.h

    + +
    +
    typedef hash_map<int, Alarm*> AlarmList;
    +
    +
    + +
    +
    typedef dense_hash_map<int, Alarm*> AlarmList;
    +
    +
    + + +
### absl::btree_map/absl::btree_set

absl::btree_map and absl::btree_set store multiple entries per tree node. This
has a number of advantages over ordered C++ standard library containers such as
`std::map`. First, the pointer overhead of pointing to child tree nodes is
often significantly reduced. Second, because the entries or key/values are
stored consecutively in memory for a given btree tree node, cache efficiency is
often significantly better.
    +

Use btree_set instead of std::set to represent a very heavily used work-queue.

    +
    + +

    register_allocator.h

    +
    using container_type = std::set<WorklistItem>;
    +
    +
    using container_type = absl::btree_set<WorklistItem>;
    +
    + +
### util::bitmap::InlinedBitVector

`util::bitmap::InlinedBitVector` can store short bit-vectors inline, and
therefore can often be a better choice than `std::vector<bool>` or other bitmap
types.
    +

Use InlinedBitVector instead of std::vector<bool>, and then use FindNextSetBit
to find the next item of interest.

    +
    + +

    block_encoder.cc

    + +
    +
    vector<bool> live_reads(nreads);
    +...
    +for (int offset = 0; offset < b_.block_width(); offset++) {
    +  ...
    +  for (int r = 0; r < nreads; r++) {
    +    if (live_reads[r]) {
    +
    +
    + +
    +
    util::bitmap::InlinedBitVector<4096> live_reads(nreads);
    +...
    +for (int offset = 0; offset < b_.block_width(); offset++) {
    +  ...
    +  for (size_t r = 0; live_reads.FindNextSetBit(&r); r++) {
    +    DCHECK(live_reads[r]);
    +
    +
    + + +
### absl::InlinedVector

absl::InlinedVector stores a small number of elements inline (configurable via
the second template argument). This enables small vectors up to this number of
elements to generally have better cache efficiency and also to avoid allocating
a backing store array at all when the number of elements is small.
    +

    Use InlinedVector instead of std::vector in various places.

    +
    + +

    bundle.h

    +
    class Bundle {
    + public:
    + ...
    + private:
    +  // Sequence of (slotted instruction, unslotted immediate operands).
    +  std::vector<InstructionRecord> instructions_;
    +  ...
    +};
    +
    +
    class Bundle {
    + public:
    + ...
    + private:
    +  // Sequence of (slotted instruction, unslotted immediate operands).
    +  absl::InlinedVector<InstructionRecord, 2> instructions_;
    +  ...
    +};
    +
    + +
### gtl::vector32

Saves space by using a customized vector type that only supports sizes that fit
in 32 bits.
    +

    Simple type change saves ~8TiB of memory in Spanner.

    +
    + +

    table_ply.h

    + +
    +
    class TablePly {
    +    ...
    +    // Returns the set of data columns stored in this file for this table.
    +    const std::vector<FamilyId>& modified_data_columns() const {
    +      return modified_data_columns_;
    +    }
    +    ...
    +   private:
    +    ...
    +    std::vector<FamilyId> modified_data_columns_;  // Data columns in the table.
    +
    +
    + +
    +
    #include "util/gtl/vector32.h"
    +    ...
    +    // Returns the set of data columns stored in this file for this table.
    +    absl::Span<const FamilyId> modified_data_columns() const {
    +      return modified_data_columns_;
    +    }
    +    ...
    +
    +    ...
    +    // Data columns in the table.
    +    gtl::vector32<FamilyId> modified_data_columns_;
    +
    +
    + + +
### gtl::small_map

gtl::small_map uses an inline array to store up to a certain number of unique
key-value-pair elements, but upgrades itself automatically to be backed by a
user-specified map type when it runs out of space.
    +

    Use gtl::small_map in tflite_model.

    +
    + +

    tflite_model.cc

    + +
    +
    using ChoiceIdToContextMap = gtl::flat_hash_map<int, TFLiteContext*>;
    +
    +
    + +
    +
    using ChoiceIdToContextMap =
    +    gtl::small_map<gtl::flat_hash_map<int, TFLiteContext*>>;
    +
    +
    + + +
### gtl::small_ordered_set

gtl::small_ordered_set is an optimization for associative containers (such as
std::set or absl::btree_multiset). It uses a fixed array to store a certain
number of elements, then reverts to using a set or multiset when it runs out of
space. For sets that are typically small, this can be considerably faster than
using something like set directly, as set is optimized for large data sets.
This change shrinks cache footprint and reduces critical section length.
    +

    Use gtl::small_ordered_set to hold set of listeners.

    +
    + +

    broadcast_stream.h

    + +
    +
    class BroadcastStream : public ParsedRtpTransport {
    + ...
    + private:
    +  ...
    +  std::set<ParsedRtpTransport*> listeners_ ABSL_GUARDED_BY(listeners_mutex_);
    +};
    +
    +
    + +
    +
    class BroadcastStream : public ParsedRtpTransport {
    + ...
    + private:
    +  ...
    +  using ListenersSet =
    +      gtl::small_ordered_set<std::set<ParsedRtpTransport*>, 10>;
    +  ListenersSet listeners_ ABSL_GUARDED_BY(listeners_mutex_);
    +
    +
    + + +
### gtl::intrusive_list {#gtl-intrusive_list}

`gtl::intrusive_list` is a doubly-linked list where the link pointers are
embedded in the elements of type T. It saves one cache line+indirection per
element when compared to `std::list`.
    +

Use intrusive_list to keep track of inflight requests for each index row
update.

    +
    + +

    row-update-sender-inflight-set.h

    + +
    +
    std::set<int64> inflight_requests_ GUARDED_BY(mu_);
    +
    +
    + +
    +
    class SeqNum : public gtl::intrusive_link<SeqNum> {
    +  ...
    +  int64 val_ = -1;
    +  ...
    +};
    +...
    +gtl::intrusive_list<SeqNum> inflight_requests_ GUARDED_BY(mu_);
    +
    +
    + + +
### Limit absl::Status and absl::StatusOr usage

Even though `absl::Status` and `absl::StatusOr` types are fairly efficient,
they have a non-zero overhead even in the success path and should therefore be
avoided for hot routines that don't need to return any meaningful error details
(or perhaps never even fail!):
    +

Avoid a StatusOr<int64> return type for the RoundUpToAlignment() function.

    +
    + +

    best_fit_allocator.cc

    + +
    +
    absl::StatusOr<int64> BestFitAllocator::RoundUpToAlignment(int64 bytes) const {
    +  TPU_RET_CHECK_GE(bytes, 0);
    +
    +  const int64 max_aligned = MathUtil::RoundDownTo<int64>(
    +      std::numeric_limits<int64>::max(), alignment_in_bytes_);
    +  if (bytes > max_aligned) {
    +    return util::ResourceExhaustedErrorBuilder(ABSL_LOC)
    +           << "Attempted to allocate "
    +           << strings::HumanReadableNumBytes::ToString(bytes)
    +           << " which after aligning to "
    +           << strings::HumanReadableNumBytes::ToString(alignment_in_bytes_)
    +           << " cannot be expressed as an int64.";
    +  }
    +
    +  return MathUtil::RoundUpTo<int64>(bytes, alignment_in_bytes_);
    +}
    +
    +
    + +

    best_fit_allocator.h

    + +
    +
    // Rounds bytes up to nearest multiple of alignment_.
    +// REQUIRES: bytes >= 0.
    +// REQUIRES: result does not overflow int64.
    +// REQUIRES: alignment_in_bytes_ is a power of 2 (checked in constructor).
    +int64 RoundUpToAlignment(int64 bytes) const {
    +  DCHECK_GE(bytes, 0);
    +  DCHECK_LE(bytes, max_aligned_bytes_);
    +  int64 result =
    +      ((bytes + (alignment_in_bytes_ - 1)) & ~(alignment_in_bytes_ - 1));
    +  DCHECK_EQ(result, MathUtil::RoundUpTo<int64>(bytes, alignment_in_bytes_));
    +  return result;
    +}
    +
    +
    + + +
    + + + +
    +

Add ShapeUtil::ForEachIndexNoStatus to avoid creating a Status return object
for every element of a tensor.

    +
    + +

    shape_util.h

    + +
    +
    using ForEachVisitorFunction =
    +    absl::FunctionRef<StatusOr<bool>(absl::Span<const int64_t>)>;
    +    ...
    +static void ForEachIndex(const Shape& shape, absl::Span<const int64_t> base,
    +                         absl::Span<const int64_t> count,
    +                         absl::Span<const int64_t> incr,
    +                         const ForEachVisitorFunction& visitor_function);
    +
    +
    +
    + +
    +
    using ForEachVisitorFunctionNoStatus =
    +    absl::FunctionRef<bool(absl::Span<const int64_t>)>;
    +    ...
    +static void ForEachIndexNoStatus(
    +    const Shape& shape, absl::Span<const int64_t> base,
    +    absl::Span<const int64_t> count, absl::Span<const int64_t> incr,
    +    const ForEachVisitorFunctionNoStatus& visitor_function);
    +
    +
    + +

    literal.cc

    + +
    +
    ShapeUtil::ForEachIndex(
    +    result_shape, [&](absl::Span<const int64_t> output_index) {
    +      for (int64_t i = 0, end = dimensions.size(); i < end; ++i) {
    +        scratch_source_index[i] = output_index[dimensions[i]];
    +      }
    +      int64_t dest_index = IndexUtil::MultidimensionalIndexToLinearIndex(
    +          result_shape, output_index);
    +      int64_t source_index = IndexUtil::MultidimensionalIndexToLinearIndex(
    +          shape(), scratch_source_index);
    +      memcpy(dest_data + primitive_size * dest_index,
    +             source_data + primitive_size * source_index, primitive_size);
    +      return true;
    +    });
    +
    +
    + +
    +
    ShapeUtil::ForEachIndexNoStatus(
    +    result_shape, [&](absl::Span<const int64_t> output_index) {
    +      // Compute dest_index
    +      int64_t dest_index = IndexUtil::MultidimensionalIndexToLinearIndex(
    +          result_shape, result_minor_to_major, output_index);
    +
    +      // Compute source_index
    +      int64_t source_index;
    +      for (int64_t i = 0, end = dimensions.size(); i < end; ++i) {
    +        scratch_source_array[i] = output_index[dimensions[i]];
    +      }
    +      if (src_shape_dims == 1) {
    +        // Fast path for this case
    +        source_index = scratch_source_array[0];
    +        DCHECK_EQ(source_index,
    +                  IndexUtil::MultidimensionalIndexToLinearIndex(
    +                      src_shape, src_minor_to_major, scratch_source_span));
    +      } else {
    +        source_index = IndexUtil::MultidimensionalIndexToLinearIndex(
    +            src_shape, src_minor_to_major, scratch_source_span);
    +      }
    +      // Move one element from source_index in source to dest_index in dest
    +      memcpy(dest_data + PRIMITIVE_SIZE * dest_index,
    +             source_data + PRIMITIVE_SIZE * source_index, PRIMITIVE_SIZE);
    +      return true;
    +    });
    +
    +
    + + +
    + + + +
    +

In TF_CHECK_OK, avoid creating an Ok object just to test for ok().

    +
    + +

    status.h

    + +
    +
    #define TF_CHECK_OK(val) CHECK_EQ(::tensorflow::Status::OK(), (val))
    +#define TF_QCHECK_OK(val) QCHECK_EQ(::tensorflow::Status::OK(), (val))
    +
    +
    + +
    +
    extern tensorflow::string* TfCheckOpHelperOutOfLine(
    +    const ::tensorflow::Status& v, const char* msg);
    +inline tensorflow::string* TfCheckOpHelper(::tensorflow::Status v,
    +                                           const char* msg) {
    +  if (v.ok()) return nullptr;
    +  return TfCheckOpHelperOutOfLine(v, msg);
    +}
    +#define TF_CHECK_OK(val)                                           \
    +  while (tensorflow::string* _result = TfCheckOpHelper(val, #val)) \
    +  LOG(FATAL) << *(_result)
    +#define TF_QCHECK_OK(val)                                          \
    +  while (tensorflow::string* _result = TfCheckOpHelper(val, #val)) \
    +  LOG(QFATAL) << *(_result)
    +
    +
    + + +
    + + + +
    +

Remove StatusOr from the hot path of remote procedure calls (RPCs).

    +
    + +

Removal of StatusOr from a hot path eliminated a 14% CPU regression in RPC
benchmarks caused by an earlier change.

    +

    privacy_context.h

    + +
    +
    absl::StatusOr<privacy::context::PrivacyContext> GetRawPrivacyContext(
    +    const CensusHandle& h);
    +
    +
    + +

    privacy_context_statusfree.h

    + +
    +
    enum class Result {
    +  kSuccess,
    +  kNoRootScopedData,
    +  kNoPrivacyContext,
    +  kNoDDTContext,
    +  kDeclassified,
    +  kNoPrequestContext
    +};
    +...
    +Result GetRawPrivacyContext(const CensusHandle& h,
    +                            PrivacyContext* privacy_context);
    +
    +
    + + +
## Bulk operations {#bulk-operations}

If possible, handle many items at once rather than just one at a time.
    +

absl::flat_hash_map compares one hash byte per key from a group of keys using
a single SIMD instruction.

See the [Swiss Table Design Notes](https://abseil.io/about/design/swisstables)
and the related CppCon 2017 and CppCon 2019 talks by Matt Kulukundis.

raw_hash_set.h:

```c++
// Returns a bitmask representing the positions of slots that match hash.
BitMask<uint32_t> Match(h2_t hash) const {
  auto ctrl = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos));
  auto match = _mm_set1_epi8(hash);
  return BitMask<uint32_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(match, ctrl)));
}
```

Use single operations that deal with many bytes at once and fix things up
afterwards, rather than deciding what to do one byte at a time.

ordered-code.cc (before):

```c++
int len = 0;
while (val > 0) {
  len++;
  buf[9 - len] = (val & 0xff);
  val >>= 8;
}
buf[9 - len - 1] = (unsigned char)len;
len++;
FastStringAppend(dest, reinterpret_cast<const char*>(buf + 9 - len), len);
```

ordered-code.cc (after):

```c++
BigEndian::Store(val, buf + 1);  // buf[0] may be needed for length
const unsigned int length = OrderedNumLength(val);
char* start = buf + 9 - length - 1;
*start = length;
AppendUpto9(dest, start, length + 1);
```
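
The rewrite leans on OrderedNumLength to find the significant-byte count
without a loop. Its implementation isn't shown above, but under the semantics
implied there it can be a single bit-scan; a sketch, with absl::bit_width
standing in for whatever the real code uses:

```c++
#include <cstdint>

#include "absl/numeric/bits.h"

// Assumed semantics: the number of significant bytes in val (0 for val == 0),
// matching what the byte-at-a-time loop computed in `len`.
inline unsigned int OrderedNumLength(uint64_t val) {
  return (absl::bit_width(val) + 7) / 8;  // one bit-scan, no per-byte loop
}
```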

Improve Reed-Solomon processing speed by handling multiple interleaved input
buffers more efficiently, in chunks.

```
Run on (12 X 3501 MHz CPUs); 2016-09-27T16:04:55.065995192-04:00
CPU: Intel Haswell with HyperThreading (6 cores) dL1:32KB dL2:256KB dL3:15MB
Benchmark                          Base (ns)  New (ns) Improvement
------------------------------------------------------------------
BM_OneOutput/3/2                      466867    351818    +24.6%
BM_OneOutput/4/2                      563130    474756    +15.7%
BM_OneOutput/5/3                      815393    688820    +15.5%
BM_OneOutput/6/3                      897246    780539    +13.0%
BM_OneOutput/8/4                     1270489   1137149    +10.5%
BM_AllOutputs/3/2                     848772    642942    +24.3%
BM_AllOutputs/4/2                    1067647    638139    +40.2%
BM_AllOutputs/5/3                    1739135   1151369    +33.8%
BM_AllOutputs/6/3                    2045817   1456744    +28.8%
BM_AllOutputs/8/4                    3012958   2484937    +17.5%
BM_AllOutputsSetUpOnce/3/2            717310    493371    +31.2%
BM_AllOutputsSetUpOnce/4/2            833866    600060    +28.0%
BM_AllOutputsSetUpOnce/5/3           1537870   1137357    +26.0%
BM_AllOutputsSetUpOnce/6/3           1802353   1398600    +22.4%
BM_AllOutputsSetUpOnce/8/4           3166930   2455973    +22.4%
```
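
The CL itself isn't shown; the following is only a schematic of the chunking
idea, with invented names and with XOR standing in for the real GF(2^8)
arithmetic. Rather than streaming each input buffer across the entire output
(which evicts the output from cache between passes), walk all inputs together
over cache-sized chunks so the current output chunk stays resident:

```c++
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

constexpr size_t kChunk = 4096;  // assumed chunk size, chosen to stay in cache

// Accumulate several input buffers of length n into out, chunk by chunk.
void AccumulateInterleaved(const std::vector<const uint8_t*>& inputs,
                           uint8_t* out, size_t n) {
  for (size_t base = 0; base < n; base += kChunk) {
    const size_t len = std::min(kChunk, n - base);
    for (const uint8_t* in : inputs) {
      for (size_t i = 0; i < len; ++i) {
        out[base + i] ^= in[base + i];  // stand-in for the GF(2^8) update
      }
    }
  }
}
```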

Decode four integers at a time (circa 2004).

Introduced a GroupVarInt format that encodes/decodes groups of 4
variable-length integers at a time in 5-17 bytes, rather than one integer at a
time. Decoding one group of 4 integers in the new format takes ~1/3rd the time
of decoding 4 individually varint-encoded integers.

groupvarint.cc:

```c++
const char* DecodeGroupVar(const char* p, int N, uint32* dest) {
  assert(groupvar_initialized);
  assert(N % 4 == 0);
  while (N) {
    uint8 tag = *p;
    p++;

    uint8* lenptr = &groupvar_table[tag].length[0];

#define GET_NEXT                                        \
    do {                                                \
      uint8 len = *lenptr;                              \
      *dest = UNALIGNED_LOAD32(p) & groupvar_mask[len]; \
      dest++;                                           \
      p += len;                                         \
      lenptr++;                                         \
    } while (0)
    GET_NEXT;
    GET_NEXT;
    GET_NEXT;
    GET_NEXT;
#undef GET_NEXT

    N -= 4;
  }
  return p;
}
```

Encode groups of 4 k-bit numbers at a time.

Added KBitStreamEncoder and KBitStreamDecoder classes to encode/decode 4 k-bit
numbers at a time into a bit stream. Since K is known at compile time, the
encoding and decoding can be quite efficient. E.g., since four numbers are
encoded at a time, the code can assume that the stream is always byte-aligned
(for even k), or nibble-aligned (for odd k).
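
The actual classes aren't shown; here is a minimal sketch of the even-K case
(all names hypothetical) to illustrate why a compile-time K helps: the masks
and shifts below become constants, and 4*K bits is always a whole number of
bytes:

```c++
#include <cstdint>
#include <string>

// Pack four K-bit values into (4*K)/8 bytes. The real classes presumably also
// handle odd K, where each group is nibble-aligned instead.
template <int K>
void EncodeGroup(const uint32_t v[4], std::string* out) {
  static_assert(K % 2 == 0 && K > 0 && K <= 16,
                "this sketch only handles the even-K, <=16-bit case");
  uint64_t bits = 0;
  for (int i = 0; i < 4; ++i) {
    bits |= static_cast<uint64_t>(v[i] & ((1u << K) - 1)) << (i * K);
  }
  for (int b = 0; b < 4 * K / 8; ++b) {  // whole bytes, since K is even
    out->push_back(static_cast<char>(bits >> (8 * b)));
  }
}
```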

## CLs that demonstrate multiple techniques {#cls-that-demonstrate-multiple-techniques}

Sometimes a single CL contains a number of performance-improving changes that
use many of the preceding techniques. Looking at the kinds of changes in these
CLs is a good way to get into the mindset of making general changes to speed
up part of a system once it has been identified as a bottleneck.

Speed up GPU memory allocator by ~40%.

36-48% speedup in allocation/deallocation speed for GPUBFCAllocator:

1.  Identify chunks by a handle number, rather than by a pointer to a Chunk.
    Chunk data structures are now allocated in a vector<Chunk>, and a handle
    is an index into this vector that refers to a particular chunk. This
    allows the next and prev pointers in Chunk to be ChunkHandle (4 bytes),
    rather than Chunk* (8 bytes).

2.  When a Chunk object is no longer in use, we maintain a free list of Chunk
    objects, whose head is designated by ChunkHandle free_chunks_list_, and
    with Chunk->next pointing to the next free-list entry. Together with (1),
    this allows us to avoid heap allocation/deallocation of Chunk objects in
    the allocator, except (rarely) when the vector<Chunk> grows. It also makes
    all the memory for Chunk objects contiguous.

3.  Rather than having the bins_ data structure be a std::set and using
    lower_bound to locate the appropriate bin given a byte_size, we instead
    have an array of bins, indexed by a function of log₂(byte_size/256). This
    allows the bin to be located with a few bit operations, rather than a
    binary search tree lookup. It also allows us to allocate the storage for
    all the Bin data structures in a contiguous array, rather than in many
    different cache lines, which reduces the number of cache lines that must
    be moved around between cores when multiple threads are doing allocations.
    (Ideas (1)-(3) are sketched in code after the benchmark results below.)

4.  Added a fast path to GPUBFCAllocator::AllocateRaw that first tries to
    allocate memory without involving the retry_helper_. If the initial
    attempt fails (returns nullptr), then we go through the retry_helper_, but
    normally we can avoid several levels of procedure calls as well as the
    allocation/deallocation of a std::function with several arguments.

5.  Commented out most of the VLOG calls. These can be reenabled selectively
    when needed for debugging purposes by uncommenting and recompiling.

Added a multi-threaded benchmark to test allocation under contention.

Speeds up ptb_word_lm on my desktop machine with a Titan X card from 8036
words per second to 8272 words per second (+2.9%).

```
Run on (40 X 2801 MHz CPUs); 2016/02/16-15:12:49
CPU: Intel Ivybridge with HyperThreading (20 cores) dL1:32KB dL2:256KB dL3:25MB
Benchmark                          Base (ns)  New (ns) Improvement
------------------------------------------------------------------
BM_Allocation                            347       184    +47.0%
BM_AllocationThreaded/1                  351       181    +48.4%
BM_AllocationThreaded/4                 2470      1975    +20.0%
BM_AllocationThreaded/16               11846      9507    +19.7%
BM_AllocationDelayed/1                   392       199    +49.2%
BM_AllocationDelayed/10                  285       169    +40.7%
BM_AllocationDelayed/100                 245       149    +39.2%
BM_AllocationDelayed/1000                238       151    +36.6%
```
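
A minimal sketch of ideas (1)-(3), with all the details invented for
illustration: handles are indices into a contiguous vector, freed Chunk
records are threaded onto a free list through their next field, and the bin
for a size is found with a bit-scan rather than a tree search.

```c++
#include <cstddef>
#include <cstdint>
#include <vector>

#include "absl/numeric/bits.h"

using ChunkHandle = uint32_t;  // 4 bytes, vs. 8 for a Chunk*
constexpr ChunkHandle kInvalidChunkHandle = ~ChunkHandle{0};

struct Chunk {
  size_t size = 0;
  ChunkHandle prev = kInvalidChunkHandle;
  ChunkHandle next = kInvalidChunkHandle;  // also threads the free list
};

std::vector<Chunk> chunks_;  // all Chunk records, contiguous in memory
ChunkHandle free_chunks_list_ = kInvalidChunkHandle;

ChunkHandle AllocateChunkHandle() {
  if (free_chunks_list_ != kInvalidChunkHandle) {
    ChunkHandle h = free_chunks_list_;   // reuse a freed record: no heap work
    free_chunks_list_ = chunks_[h].next;
    return h;
  }
  chunks_.emplace_back();                // rare: only when the vector grows
  return static_cast<ChunkHandle>(chunks_.size() - 1);
}

void DeallocateChunkHandle(ChunkHandle h) {
  chunks_[h].next = free_chunks_list_;   // push onto the free list
  free_chunks_list_ = h;
}

// Idea (3): bin index = log2(byte_size / 256), a shift plus a bit-scan
// instead of a binary search tree lookup.
int BinIndexForSize(size_t byte_size) {
  size_t b = byte_size < 256 ? 256 : byte_size;
  return static_cast<int>(absl::bit_width(b >> 8)) - 1;
}
```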

Speed up Pathways throughput by ~20% via a set of miscellaneous changes.

*   Unified a bunch of special fast descriptor-parsing functions into a single
    ParsedDescriptor class, and used this class in more places to avoid
    expensive full parse calls.
*   Changed several protocol buffer fields from string to bytes (avoids
    unnecessary UTF-8 checks and the associated error-handling code).
*   DescriptorProto.inlined_contents is now a string, not a Cord (it is
    expected to be used only for small-ish tensors). This necessitated the
    addition of a bunch of copying helpers in tensor_util.cc (which now needs
    to support both strings and Cords).
*   Used flat_hash_map instead of std::unordered_map in a few places.
*   Added MemoryManager::LookupMany for use by the Stack op instead of calling
    Lookup per batch element. This reduces setup overhead like locking (see
    the sketch after the performance results below).
*   Removed some unnecessary string creation in TransferDispatchOp.

Performance results for transferring a batch of 1000 1KB tensors from one
component to another in the same process:

```
Before: 227.01 steps/sec
After:  272.52 steps/sec (+20% throughput)
```
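
A sketch of the batching idea behind LookupMany; the signature and the
surrounding details (mu_, LookupLocked) are assumptions, not the real API:

```c++
// Amortize one lock acquisition over a whole batch instead of paying for it
// on every element.
std::vector<Tensor*> MemoryManager::LookupMany(absl::Span<const int64_t> ids) {
  std::vector<Tensor*> out;
  out.reserve(ids.size());
  absl::MutexLock l(&mu_);            // once per batch, not once per element
  for (int64_t id : ids) {
    out.push_back(LookupLocked(id));  // assumed lock-held lookup helper
  }
  return out;
}
```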

~15% XLA compiler performance improvement through a series of changes.

Some changes to speed up XLA compilation:

1.  In SortComputationsByContent, return false if a == b in the comparison
    function, to avoid serializing and fingerprinting long computation strings
    (sketched below).

2.  Turn a CHECK into a DCHECK to avoid touching an extra cache line in
    HloComputation::ComputeInstructionPostOrder.

3.  Avoid making an expensive copy of the front instruction in
    CoreSequencer::IsVectorSyncHoldSatisfied().

4.  Rework the 2-argument HloComputation::ToString and HloComputation::ToCord
    routines to do the bulk of the work by appending to a std::string, rather
    than appending to a Cord.

5.  Change PerformanceCounterSet::Increment to do just a single hash table
    lookup rather than two (sketched below).

6.  Streamline the Scoreboard::Update code.

Overall speedup of 14% in XLA compilation time for one important model.
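
Sketches of changes (1) and (5); the helper names and signatures are assumed,
not the real code:

```c++
// Change (1): a strict-weak-ordering comparator may be handed the same
// element on both sides, so answer "not less-than" before doing any
// expensive serialization or fingerprinting.
bool ContentLess(const HloComputation* a, const HloComputation* b) {
  if (a == b) return false;                // cheap early out
  return Fingerprint(a) < Fingerprint(b);  // expensive: serialize and hash
}

// Change (5): operator[] finds-or-inserts in one probe, where a find()
// followed by an insert() walks the table twice.
void Increment(absl::flat_hash_map<std::string, int64_t>& counters,
               const std::string& name, int64_t delta) {
  counters[name] += delta;  // single hash table lookup
}
```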

Speed up low-level logging in Google Meet application code.

Speed up ScopedLogId, which is on the critical path for each packet.

*   Removed the LOG_EVERY_N(ERROR, ...) messages that seemed to be there only
    to see if invariants were violated.
*   Inlined the PushLogId and PopLogId() routines (without the
    LOG_EVERY_N_SECONDS(ERROR, ...) statements, they are now small enough to
    inline).
*   Switched to using a fixed array of size 4 and an 'int size' variable
    instead of an InlinedVector<...> for maintaining the thread-local state.
    Since we never grew beyond size 4 anyway, the InlinedVector's
    functionality was more general than needed (a sketch follows the
    benchmark results below).

```
Base: Baseline plus the code in scoped_logid_test.cc to add the benchmark
New: This changelist

CPU: Intel Ivybridge with HyperThreading (20 cores) dL1:32KB dL2:256KB dL3:25MB
Benchmark                                      Base (ns)    New (ns) Improvement
----------------------------------------------------------------------------
BM_ScopedLogId/threads:1                               8           4    +52.6%
BM_ScopedLogId/threads:2                               8           4    +51.9%
BM_ScopedLogId/threads:4                               8           4    +52.9%
BM_ScopedLogId/threads:8                               8           4    +52.1%
BM_ScopedLogId/threads:16                             11           6    +44.0%
```
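
A sketch of the fixed-capacity replacement (all names hypothetical): four
slots and a count cover every case the code actually encountered, with no
growth logic or heap fallback.

```c++
#include <cstdint>

// Hypothetical thread-local stack of at most 4 log ids; the real change
// replaced an InlinedVector with state of roughly this shape.
struct LogIdStack {
  uint64_t ids[4];
  int size = 0;
};
thread_local LogIdStack log_id_stack;

inline void PushLogId(uint64_t id) {
  if (log_id_stack.size < 4) {
    log_id_stack.ids[log_id_stack.size++] = id;
  }
}

inline void PopLogId() {
  if (log_id_stack.size > 0) --log_id_stack.size;
}
```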

Reduce XLA compilation time by ~31% by improving Shape handling.

Several changes to improve XLA compiler performance:

1.  Improved performance of ShapeUtil::ForEachIndex... iteration in a few
    ways:

    *   In ShapeUtil::ForEachState, save just pointers to the arrays
        represented by the spans, rather than the full span objects.
    *   Pre-form a ShapeUtil::ForEachState::indexes_span pointing at the
        ShapeUtil::ForEachState::indexes vector, rather than constructing this
        span from the vector on every loop iteration.
    *   Save a ShapeUtil::ForEachState::indexes_ptr pointer to the backing
        store of the ShapeUtil::ForEachState::indexes vector, allowing simple
        array operations in ShapeUtil::ForEachState::IncrementDim(), rather
        than more expensive vector::operator[] operations.
    *   Save a ShapeUtil::ForEachState::minor_to_major array pointer,
        initialized in the constructor by calling
        shape.layout().minor_to_major().data(), rather than calling
        LayoutUtil::Minor(...) for each dimension on each iteration.
    *   Inlined the ShapeUtil::ForEachState constructor and the
        ShapeUtil::ForEachState::IncrementDim() routine.

2.  Improved the performance of ShapeUtil::ForEachIndex iteration for call
    sites that don't need the functionality of returning a Status from the
    passed-in function. Did this by introducing
    ShapeUtil::ForEachIndexNoStatus variants, which accept a
    ForEachVisitorFunctionNoStatus (which returns a plain bool). This is
    faster than the ShapeUtil::ForEachIndex routines, which accept a
    ForEachVisitorFunction (which returns a StatusOr<bool>, requiring an
    expensive StatusOr<bool> destructor call per element that we iterate
    over). A sketch of the two visitor types follows this list.

    *   Used this variant of ShapeUtil::ForEachIndexNoStatus in
        LiteralBase::Broadcast and GenerateReduceOutputElement.

3.  Improved performance of LiteralBase::Broadcast in several ways:

    *   Introduced a templated BroadcastHelper routine in literal.cc that is
        specialized for different primitive byte sizes. (Without this,
        primitive_size was a runtime variable, so the compiler couldn't do a
        very good job of optimizing the memcpy that occurred per element, and
        would invoke the general memcpy path that assumes the byte count is
        fairly large, even though in our case it is a tiny power of 2,
        typically 1, 2, 4, or 8.)
    *   Avoided all but one of ~(5 + num_dimensions + num_result_elements)
        virtual calls per Broadcast call by making a single call to 'shape()'
        at the beginning of the LiteralBase::Broadcast routine. The innocuous
        looking 'shape()' calls that were sprinkled throughout end up boiling
        down to "root_piece().subshape()", where subshape() is a virtual
        function.
    *   In the BroadcastHelper routine, special-cased the source dimensions
        being one, avoiding a call to
        IndexUtil::MultidimensionalIndexToLinearIndex for this case.
    *   In BroadcastHelper, used a scratch_source_array pointer variable that
        points into the backing store of the scratch_source_index vector, and
        used that directly to avoid vector::operator[] operations inside the
        per-element code. Also pre-computed a scratch_source_span that points
        to the scratch_source_index vector outside the per-element loop in
        BroadcastHelper, to avoid constructing a span from the vector for
        every element.
    *   Introduced a new three-argument variant of
        IndexUtil::MultidimensionalIndexToLinearIndex where the caller passes
        in the minor_to_major span associated with the shape argument. Used
        this in BroadcastHelper to compute this for the src and dst shapes
        once per Broadcast, rather than once per element copied.

4.  In ShardingPropagation::GetShardingFromUser, for the HloOpcode::kTuple
    case, only call user.sharding().GetSubSharding(...) if we have found the
    operand to be of interest. Avoiding calling it eagerly reduces CPU time in
    this routine for one lengthy compilation from 43.7s to 2.0s.

5.  Added benchmarks for ShapeUtil::ForEachIndex and Literal::Broadcast and
    for the new ShapeUtil::ForEachIndexNoStatus.
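
A sketch of the two visitor function types from item 2, reconstructed from the
description (so treat the exact spellings as assumptions): only the return
type differs, but StatusOr<bool> adds a wrapper construction and destruction
on every element visited.

```c++
using ForEachVisitorFunction =
    absl::FunctionRef<StatusOr<bool>(absl::Span<const int64_t>)>;
using ForEachVisitorFunctionNoStatus =
    absl::FunctionRef<bool(absl::Span<const int64_t>)>;
```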

Base is with the benchmark additions of BM_ForEachIndex and
BM_BroadcastVectorToMatrix (and a BUILD file change to add the benchmark
dependency), but no other changes. New is this CL.

```
Run on (72 X 1357.56 MHz CPU s) CPU Caches: L1 Data 32 KiB (x36)
L1 Instruction 32 KiB (x36) L2 Unified 1024 KiB (x36) L3 Unified 25344 KiB (x2)

Benchmark                                      Base (ns)    New (ns) Improvement
----------------------------------------------------------------------------
BM_MakeShape                                       18.40       18.90     -2.7%
BM_MakeValidatedShape                              35.80       35.60     +0.6%
BM_ForEachIndex/0                                  57.80       55.80     +3.5%
BM_ForEachIndex/1                                  90.90       85.50     +5.9%
BM_ForEachIndex/2                               1973606     1642197     +16.8%
```

The newly added ForEachIndexNoStatus is considerably faster than the
ForEachIndex variant (it only exists in this new CL, but the benchmark work
done by BM_ForEachIndexNoStatus/NUM is comparable to the BM_ForEachIndex/NUM
results above).

```
Benchmark                                      Base (ns)    New (ns) Improvement
----------------------------------------------------------------------------
BM_ForEachIndexNoStatus/0                             0        46.90    ----
BM_ForEachIndexNoStatus/1                             0        65.60    ----
BM_ForEachIndexNoStatus/2                             0     1001277     ----
```

Broadcast performance improves by ~58%.

```
Benchmark                                      Base (ns)    New (ns) Improvement
----------------------------------------------------------------------------
BM_BroadcastVectorToMatrix/16/16                   5556        2374     +57.3%
BM_BroadcastVectorToMatrix/16/1024               319510      131075     +59.0%
BM_BroadcastVectorToMatrix/1024/1024           20216949     8408188     +58.4%
```

Macro results from doing ahead-of-time compilation of a large language model
(the program does more than just XLA compilation, but spends a bit less than
half its time in XLA-related code):

Baseline program overall: 573 seconds. With this CL: 465 seconds (+19%
improvement).

Time spent compiling the two largest XLA programs in this run:

Baseline: 141s + 143s = 284s. With this CL: 99s + 95s = 194s (+31%
improvement).

Reduce compilation time for large programs by ~22% in Plaque (a distributed
execution framework).

Small tweaks to speed up compilation by ~22%:

1.  Speed up detection of whether or not two nodes share a common source.
    Previously, we would get the sources for each node in sorted order and
    then do a sorted intersection. We now place the sources for one node in a
    hash table and then iterate over the other node's sources, checking the
    hash table (see the sketch after the measurements below).
2.  Reuse the same scratch hash table in step 1.
3.  When generating the compiled proto, keep a single btree keyed by
    pair<package, opname> instead of a btree of btrees.
4.  Store a pointer to the opdef in the preceding btree instead of copying the
    opdef into the btree.

Measurement of speed on large programs (~45K ops):

```
name             old time/op  new time/op  delta
BM_CompileLarge   28.5s ± 2%   22.4s ± 2%  -21.61%  (p=0.008 n=5+5)
```
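
A sketch of tweaks (1) and (2), with all names invented: one node's sources go
into a reusable hash set, and the other node's sources are probed against it.

```c++
#include "absl/container/flat_hash_set.h"

// Hypothetical Node type exposing its source nodes. The caller owns the
// scratch set and reuses it across calls (tweak 2), so its allocation is
// amortized away.
bool ShareCommonSource(const Node& a, const Node& b,
                       absl::flat_hash_set<const Node*>* scratch) {
  scratch->clear();
  for (const Node* s : a.sources()) scratch->insert(s);
  for (const Node* s : b.sources()) {
    if (scratch->contains(s)) return true;
  }
  return false;
}
```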

MapReduce improvements (~2X speedup for a wordcount benchmark).

MapReduce speedups:

1.  The combiner data structures for the SafeCombinerMapOutput class have been
    changed. Rather than using a hash_multimap<SafeCombinerKey, StringPiece>,
    which had a hash table entry for each unique key/value inserted in the
    table, we instead use a hash_map<SafeCombinerKey, ValuePtr*> (where
    ValuePtr is a linked list of values and repetition counts; see the sketch
    after this list). This helps in three ways:

    *   It significantly reduces memory usage, since we only use
        "sizeof(ValuePtr) + value_len" bytes for each value, rather than
        "sizeof(SafeCombinerKey) + sizeof(StringPiece) + value_len + new hash
        table entry overhead" for each value. This means that we flush the
        reducer buffer less often.
    *   It's significantly faster, since we avoid extra hash table entries
        when we're inserting a new value for a key that already exists in the
        table (instead, we just hook the value into the linked list of values
        for that key).
    *   Since we associate a repetition count with each value in the linked
        list, we can represent this sequence:

        ```
        Output(key, "1");
        Output(key, "1");
        Output(key, "1");
        Output(key, "1");
        Output(key, "1");
        ```

        as a single entry in the linked list for "key" with a repetition
        count of 5. Internally we yield "1" five times to the user-level
        combining function. (A similar trick could perhaps be applied on the
        reduce side.)

2.  (Minor) Added a test for "nshards == 1" to the default
    MapReductionBase::KeyFingerprintSharding function that avoids
    fingerprinting the key entirely if we are using just 1 reduce shard
    (since we can return 0 directly in that case without examining the key).

3.  Turned some VLOG(3) statements into DVLOG(3) in the code path that is
    called for each key/value added to the combiner.

Reduces the time for one wordcount benchmark from 12.56s to 6.55s.
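
A sketch of the combiner layout described in (1); the field names follow the
description, but every detail is invented. Checking just the head of the
chain for a repeat is enough to collapse runs like the Output(key, "1")
sequence above into one counted entry.

```c++
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>

#include "absl/container/flat_hash_map.h"
#include "absl/strings/string_view.h"

struct ValuePtr {
  ValuePtr* next;  // next distinct value emitted for this key
  int count;       // repetition count for this value
  uint32_t len;    // payload length; the bytes follow the header
  char bytes[1];   // payload, allocated together with the header
};

// Assumes SafeCombinerKey is hashable; one table entry per distinct key.
absl::flat_hash_map<SafeCombinerKey, ValuePtr*> table_;

void Output(const SafeCombinerKey& key, absl::string_view value) {
  ValuePtr*& head = table_[key];
  if (head != nullptr &&
      absl::string_view(head->bytes, head->len) == value) {
    head->count++;  // repeated value: bump the count, no new allocation
    return;
  }
  auto* v = static_cast<ValuePtr*>(
      malloc(offsetof(ValuePtr, bytes) + value.size()));
  v->next = head;  // hook into this key's list of values
  v->count = 1;
  v->len = static_cast<uint32_t>(value.size());
  memcpy(v->bytes, value.data(), value.size());
  head = v;
}
```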

Rework the alarm handling code in the SelectServer to significantly improve
its performance (adding and removing an alarm drops from 771 ns to 281 ns).

Changes:

1.  Switched to using an AdjustablePriorityQueue<Alarm> instead of a
    set<Alarm*> for the AlarmQueue. This significantly speeds up alarm
    handling, reducing the time taken to add and remove an alarm from 771
    nanoseconds to 281 nanoseconds. This change avoids an
    allocation/deallocation per alarm setup (for the red-black tree node in
    the STL set object), and also gives much better cache locality: since the
    AdjustablePriorityQueue is a heap implemented in a vector, rather than a
    red-black tree, fewer cache lines are touched when manipulating the
    AlarmQueue on every trip through the selectserver loop.

2.  Converted the AlarmList in Alarmer from a hash_map to a dense_hash_map to
    avoid another allocation/deallocation per alarm addition/deletion (this
    also improves cache locality when adding/removing alarms).

3.  Removed the num_alarms_stat_ and num_closures_stat_
    MinuteTenMinuteHourStat objects, and the corresponding exported variables.
    Although monitoring these seems nice, in practice they add significant
    overhead to critical networking code. Even if these variables had been
    left in as Atomic32 variables instead of MinuteTenMinuteHourStat, they
    would still have increased the cost of adding and removing alarms from
    281 nanoseconds to 340 nanoseconds.

Benchmark results without this change:

```
Benchmark                      Time(ns)  CPU(ns) Iterations
-----------------------------------------------------------
BM_AddAlarm/1                       902      771     777777
```

With this change:

```
Benchmark                      Time(ns)  CPU(ns) Iterations
-----------------------------------------------------------
BM_AddAlarm/1                       324      281    2239999
```

3.3X improvement in index serving speed!

We found a number of performance issues when planning a switch from on-disk to
in-memory index serving in 2001. This change fixed many of these problems and
took us from 150 to over 500 in-memory queries per second (for a 2 GB
in-memory index on a dual-processor Pentium III machine).

*   Lots of performance improvements to index block decoding speed (8.9 MB/s
    to 13.1 MB/s for a microbenchmark).
*   We now checksum the block during decoding. This allows all of our
    getsymbol operations to be done without any bounds checking.
*   We have grungy macros that hold the various fields of a BitDecoder in
    local variables over entire loops, and then store them back at the end of
    the loops.
*   We use inline assembly to get at the 'bsf' instruction on Intel chips for
    getUnary (finds the index of the first 1 bit in a word).
*   When decoding values into a vector, we resize the vector outside of the
    loop and just walk a pointer along the vector, rather than doing a
    bounds-checked access to store every value.
*   During docid decoding, we keep the docids in local docid space, to avoid
    multiplying by num_shards_. Only when we need the actual docid value do we
    multiply by num_shards_ and add my_shard_ (see the sketch below).
*   The IndexBlockDecoder now exports an interface 'AdvanceToDocid' that
    returns the index of the first docid ≥ "d". This permits the scanning to
    be done in terms of local docids, rather than forcing the conversion of
    each local docid to a global docid when the client calls GetDocid(index)
    for every index in the block.
*   Decoding of position data for documents is now done on demand, rather
    than eagerly for the entire block when the client asks for position data
    for any document within the block.
*   If the index block being decoded ends within 4 bytes of a page boundary,
    we copy it to a local buffer. This allows us to always load our bit
    decoding buffer via a 4-byte load, without having to worry about seg
    faults if we run off the end of an mmapped page.
*   We only initialize the first nterms_ elements of various scoring data
    structures, rather than initializing all MAX_TERMS of them (in some
    cases, we were unnecessarily memsetting 20K to 100K of data per document
    scored).
*   Avoid round_to_int and subsequent computation on intermediate scoring
    values when the value being computed is 0 (the subsequent computation was
    just writing '0' over the 0 that we had memset in these cases, and this
    was the most common case).
*   Made a bounds check on scoring data structures into a debug-mode
    assertion.
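
A sketch of the local-docid trick; the method name and placement are assumed.
Shard my_shard_ of num_shards_ works in "local" docid space while decoding and
scanning, and converts only at the boundary when a caller asks for the real
docid:

```c++
// Hypothetical accessor on the block decoder: one multiply and one add,
// deferred to the point where a global docid is actually needed.
int64_t IndexBlockDecoder::GlobalDocid(int64_t local_docid) const {
  return local_docid * num_shards_ + my_shard_;
}
```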

## Further reading

In no particular order, a list of performance-related books and articles that
the authors have found helpful:

*   [Optimizing software in C++](https://www.agner.org/optimize/optimizing_cpp.pdf)
    by Agner Fog. Describes many useful low-level techniques for improving
    performance.
*   [Understanding Software Dynamics](https://www.oreilly.com/library/view/understanding-software-dynamics/9780137589692/)
    by Richard L. Sites. Covers expert methods and advanced tools for
    diagnosing and fixing performance problems.
*   [Performance tips of the week](https://abseil.io/fast/) - a collection of
    useful tips.
*   [Performance Matters](https://travisdowns.github.io/) - a collection of
    articles about performance.
*   [Daniel Lemire's blog](https://lemire.me/blog/) - high-performance
    implementations of interesting algorithms.
*   [Building Software Systems at Google and Lessons Learned](https://www.youtube.com/watch?v=modXC5IWTJI) -
    a video describing system performance issues encountered at Google over a
    decade.
*   [Programming Pearls](https://books.google.com/books/about/Programming_Pearls.html?id=kse_7qbWbjsC)
    and
    [More Programming Pearls: Confessions of a Coder](https://books.google.com/books/about/More_Programming_Pearls.html?id=a2AZAQAAIAAJ)
    by Jon Bentley. Essays on starting with algorithms and ending up with
    simple and efficient implementations.
*   [Hacker's Delight](https://en.wikipedia.org/wiki/Hacker%27s_Delight) by
    Henry S. Warren. Bit-level and arithmetic algorithms for solving some
    common problems.
*   [Computer Architecture: A Quantitative Approach](https://books.google.com/books/about/Computer_Architecture.html?id=cM8mDwAAQBAJ)
    by John L. Hennessy and David A. Patterson. Covers many aspects of
    computer architecture, including topics that performance-minded software
    developers should be aware of, like caches, branch predictors, TLBs, etc.

## Suggested citation

If you want to cite this document, we suggest:

```
Jeffrey Dean & Sanjay Ghemawat, Performance Hints, 2025, https://abseil.io/fast/hints.html
```

Or in BibTeX:

```bibtex
@misc{DeanGhemawatPerformance2025,
  author = {Dean, Jeffrey and Ghemawat, Sanjay},
  title = {Performance Hints},
  year = {2025},
  howpublished = {\url{https://abseil.io/fast/hints.html}},
}
```

## Acknowledgments

Many colleagues have provided helpful feedback on this document, including:

*   Adrian Ulrich
*   Alexander Kuzmin
*   Alexei Bendebury
*   Alexey Alexandrov
*   Amer Diwan
*   Austin Sims
*   Benoit Boissinot
*   Brooks Moses
*   Chris Kennelly
*   Chris Ruemmler
*   Danila Kutenin
*   Darryl Gove
*   David Majnemer
*   Dmitry Vyukov
*   Emanuel Taropa
*   Felix Broberg
*   Francis Birck Moreira
*   Gideon Glass
*   Henrik Stewenius
*   Jeremy Dorfman
*   John Dethridge
*   Kurt Kluever
*   Kyle Konrad
*   Lucas Pereira
*   Marc Eaddy
*   Michael Marty
*   Michael Whittaker
*   Mircea Trofin
*   Misha Brukman
*   Nicolas Hillegeer
*   Ranjit Mathew
*   Rasmus Larsen
*   Soheil Hassas Yeganeh
*   Srdjan Petrovic
*   Steinar H. Gunderson
*   Stergios Stergiou
*   Steven Timotius
*   Sylvain Vignaud
*   Thomas Etter
*   Thomas Köppe
*   Tim Chestnutt
*   Todd Lipcon
*   Vance Lankhaar
*   Victor Costan
*   Yao Zuo
*   Zhou Fang
*   Zuguang Yang

[go benchmarks]: https://pkg.go.dev/testing#hdr-Benchmarks
[fast39]: https://abseil.io/fast/39
[fast53]: https://abseil.io/fast/53
[fast75]: https://abseil.io/fast/75
[cpp benchmarks]: https://github.com/google/benchmark/blob/main/README.md
[jmh]: https://github.com/openjdk/jmh
[xprof]: https://www.tensorflow.org/tensorboard/tensorboard_profiling_keras#debug_performance_bottlenecks
[profile sources]: https://gperftools.github.io/gperftools/heapprofile.html
[annotated source]: https://github.com/google/pprof/blob/main/doc/README.md#annotated-source-code
[disassembly]: https://github.com/google/pprof/blob/main/doc/README.md#annotated-source-code
[atomic danger]: https://abseil.io/docs/cpp/atomic_danger

diff --git a/js/perf_hints.js b/js/perf_hints.js
new file mode 100644
index 00000000..630bb24b
--- /dev/null
+++ b/js/perf_hints.js
@@ -0,0 +1,13 @@
+const expand = document.querySelector("#expand-details-button");
+const collapse = document.querySelector("#collapse-details-button");
+
+expand.addEventListener("click", (event) => {
+  document.body.querySelectorAll('details')
+      .forEach((it) => { if (!it.hasAttribute('open')) it.setAttribute('open', true)});
+});
+
+collapse.addEventListener("click", (event) => {
+  document.body.querySelectorAll('details')
+      .forEach((it) => {if (it.hasAttribute('open')) it.removeAttribute('open')});
+});