diff --git a/_includes/side-nav-fast.html b/_includes/side-nav-fast.html
index 15002002..4768352e 100644
--- a/_includes/side-nav-fast.html
+++ b/_includes/side-nav-fast.html
@@ -2,6 +2,11 @@
Add bulk MemoryManager::LookupMany interface.
+In addition to adding a bulk interface, this change also simplified the
+signature of the new bulk variant: it turns out clients only need to know
+whether all the keys were found, so it can return a bool rather than a Status
+object.
+memory_manager.h
+class MemoryManager {
+ public:
+ ...
+ util::StatusOr<LiveTensor> Lookup(const TensorIdProto& id);
+
+class MemoryManager {
+ public:
+ ...
+ util::StatusOr<LiveTensor> Lookup(const TensorIdProto& id);
+
+ // Lookup the identified tensors
+ struct LookupKey {
+ ClientHandle client;
+ uint64 local_id;
+ };
+ bool LookupMany(absl::Span<const LookupKey> keys,
+ absl::Span<tensorflow::Tensor> tensors);
+
+Add bulk ObjectStore::DeleteRefs API to amortize locking overhead.
+object_store.h
+template <typename T>
+class ObjectStore {
+ public:
+ ...
+ absl::Status DeleteRef(Ref);
+
+template <typename T>
+class ObjectStore {
+ public:
+ ...
+ absl::Status DeleteRef(Ref);
+
+ // Delete many references. For each ref, if no other Refs point to the same
+ // object, the object will be deleted. Returns non-OK on any error.
+ absl::Status DeleteRefs(absl::Span<const Ref> refs);
+ ...
+template <typename T>
+absl::Status ObjectStore<T>::DeleteRefs(absl::Span<const Ref> refs) {
+ util::Status result;
+ absl::MutexLock l(&mu_);
+ for (auto ref : refs) {
+ result.Update(DeleteRefLocked(ref));
+ }
+ return result;
+}
+
+memory_tracking.cc
+void HandleBatch(int, const plaque::Batch& input) override {
+ for (const auto& t : input) {
+ auto in = In(t);
+ PLAQUE_OP_ASSIGN_OR_RETURN(const auto& handles, in.handles());
+ for (const auto handle : handles.value->handles()) {
+ PLAQUE_OP_RETURN_IF_ERROR(in_buffer_store_
+ ? bstore_->DeleteRef(handle)
+ : tstore_->DeleteRef(handle));
+ }
+ }
+}
+
+void HandleBatch(int, const plaque::Batch& input) override {
+ for (const auto& t : input) {
+ auto in = In(t);
+ PLAQUE_OP_ASSIGN_OR_RETURN(const auto& handles, in.handles());
+ if (in_buffer_store_) {
+ PLAQUE_OP_RETURN_IF_ERROR(
+ bstore_->DeleteRefs(handles.value->handles()));
+ } else {
+ PLAQUE_OP_RETURN_IF_ERROR(
+ tstore_->DeleteRefs(handles.value->handles()));
+ }
+ }
+}
+
+Use Floyd's heap construction for efficient initialization.
+Bulk initialization of a heap can be done in O(N) time, whereas adding one
+element at a time and updating the heap property after each addition requires
+O(N lg(N)) time.
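+As a rough illustration (this is not the code from the actual change, whose
+container and element types are not shown here), Floyd's bottom-up heap
+construction sifts every internal node down, starting from the last one;
+std::make_heap performs the same bulk construction:
+
+#include <cstddef>
+#include <utility>
+#include <vector>
+
+void SiftDown(std::vector<int>& v, size_t i, size_t n) {
+  while (true) {
+    size_t largest = i;
+    const size_t l = 2 * i + 1, r = 2 * i + 2;
+    if (l < n && v[l] > v[largest]) largest = l;
+    if (r < n && v[r] > v[largest]) largest = r;
+    if (largest == i) return;
+    std::swap(v[i], v[largest]);
+    i = largest;
+  }
+}
+
+// Builds a max-heap in O(N) total work: only internal nodes are sifted down,
+// and most of them sit near the bottom of the heap where sifting is cheap.
+void BuildMaxHeap(std::vector<int>& v) {
+  for (size_t i = v.size() / 2; i-- > 0;) {
+    SiftDown(v, i, v.size());
+  }
+}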
+Cache block decode results for use in future calls.
+Each lookup needs to decode a whole block of K entries. Store the decoded
+entries in a cache and consult the cache on future lookups.
+lexicon.cc
+void GetTokenString(int pos, std::string* out) const {
+ ...
+ absl::FixedArray<LexiconEntry, 32> entries(pos + 1);
+
+ // Decode all lexicon entries up to and including pos.
+ for (int i = 0; i <= pos; ++i) {
+ p = util::coding::TwoValuesVarint::Decode32(p, &entries[i].remaining,
+ &entries[i].shared);
+ entries[i].remaining_str = p;
+ p += entries[i].remaining; // remaining bytes trail each entry.
+ }
+
+mutable std::vector<absl::InlinedVector<std::string, 16>> cache_;
+...
+void GetTokenString(int pos, std::string* out) const {
+ ...
+ DCHECK_LT(skentry, cache_.size());
+ if (!cache_[skentry].empty()) {
+ *out = cache_[skentry][pos];
+ return;
+ }
+ ...
+ // Init cache.
+ ...
+ const char* prev = p;
+ for (int i = 0; i < block_sz; ++i) {
+ uint32 shared, remaining;
+ p = TwoValuesVarint::Decode32(p, &remaining, &shared);
+ auto& cur = cache_[skentry].emplace_back();
+ gtl::STLStringResizeUninitialized(&cur, remaining + shared);
+
+ std::memcpy(cur.data(), prev, shared);
+ std::memcpy(cur.data() + shared, p, remaining);
+ prev = cur.data();
+ p += remaining;
+ }
+ *out = cache_[skentry][pos];
+
+Add RPC_Stats::RecordRPC variant allowing the client to pass in an already
+available WallTime value.
+rpc-stats.h
+static void RecordRPC(const Name &name, const RPC_Stats_Measurement& m);
+
+static void RecordRPC(const Name &name, const RPC_Stats_Measurement& m,
+ WallTime now);
+
+clientchannel.cc
+const WallTime now = WallTime_Now();
+...
+RPC_Stats::RecordRPC(stats_name, m);
+
+const WallTime now = WallTime_Now();
+...
+RPC_Stats::RecordRPC(stats_name, m, now);
+
+Make a class thread-compatible since callers are already synchronized.
+hitless-transfer-phase.cc
+TransferPhase HitlessTransferPhase::get() const {
+ static CallsiteMetrics cm("HitlessTransferPhase::get");
+ MonitoredMutexLock l(&cm, &mutex_);
+ return phase_;
+}
+
+TransferPhase HitlessTransferPhase::get() const { return phase_; }
+
+hitless-transfer-phase.cc
+bool HitlessTransferPhase::AllowAllocate() const {
+ static CallsiteMetrics cm("HitlessTransferPhase::AllowAllocate");
+ MonitoredMutexLock l(&cm, &mutex_);
+ return phase_ == TransferPhase::kNormal || phase_ == TransferPhase::kBrownout;
+}
+
+bool HitlessTransferPhase::AllowAllocate() const {
+ return phase_ == TransferPhase::kNormal || phase_ == TransferPhase::kBrownout;
+}
+
+Add nodes to cycle detection structure in reverse post-order.
+We were previously adding graph nodes and edges one at a time to a
+cycle-detection data structure, which required expensive work per edge. We now
+add the entire graph in reverse post-order, which makes cycle detection
+trivial.
+graphcycles.h
+class GraphCycles : public util_graph::Graph {
+ public:
+ GraphCycles();
+ ~GraphCycles() override;
+
+ using Node = util_graph::Node;
+
+class GraphCycles : public util_graph::Graph {
+ public:
+ GraphCycles();
+ ~GraphCycles() override;
+
+ using Node = util_graph::Node;
+
+ // InitFrom adds all the nodes and edges from src, returning true if
+ // successful, false if a cycle is encountered.
+ // REQUIRES: no nodes and edges have been added to GraphCycles yet.
+ bool InitFrom(const util_graph::Graph& src);
+
+graphcycles.cc
+bool GraphCycles::InitFrom(const util_graph::Graph& src) {
+ ...
+ // Assign ranks in topological order so we don't need any reordering during
+ // initialization. For an acyclic graph, DFS leaves nodes in reverse
+ // topological order, so we assign decreasing ranks to nodes as we leave them.
+ Rank last_rank = n;
+ auto leave = [&](util_graph::Node node) {
+ DCHECK(r->rank[node] == kMissingNodeRank);
+ NodeInfo* nn = &r->nodes[node];
+ nn->in = kNil;
+ nn->out = kNil;
+ r->rank[node] = --last_rank;
+ };
+ util_graph::DFSAll(src, std::nullopt, leave);
+
+ // Add all the edges (detect cycles as we go).
+ bool have_cycle = false;
+ util_graph::PerEdge(src, [&](util_graph::Edge e) {
+ DCHECK_NE(r->rank[e.src], kMissingNodeRank);
+ DCHECK_NE(r->rank[e.dst], kMissingNodeRank);
+ if (r->rank[e.src] >= r->rank[e.dst]) {
+ have_cycle = true;
+ } else if (!HasEdge(e.src, e.dst)) {
+ EdgeListAddNode(r, &r->nodes[e.src].out, e.dst);
+ EdgeListAddNode(r, &r->nodes[e.dst].in, e.src);
+ }
+ });
+ if (have_cycle) {
+ return false;
+ } else {
+ DCHECK(CheckInvariants());
+ return true;
+ }
+}
+
+graph_partitioner.cc
+absl::Status MergeGraph::Init() {
+ const Graph& graph = *compiler_->graph();
+ clusters_.resize(graph.NodeLimit());
+ graph.PerNode([&](Node node) {
+ graph_->AddNode(node);
+ NodeList* n = new NodeList;
+ n->push_back(node);
+ clusters_[node] = n;
+ });
+ absl::Status s;
+ PerEdge(graph, [&](Edge e) {
+ if (!s.ok()) return;
+ if (graph_->HasEdge(e.src, e.dst)) return; // already added
+ if (!graph_->InsertEdge(e.src, e.dst)) {
+ s = absl::InvalidArgumentError("cycle in the original graph");
+ }
+ });
+ return s;
+}
+
+absl::Status MergeGraph::Init() {
+ const Graph& graph = *compiler_->graph();
+ if (!graph_->InitFrom(graph)) {
+ return absl::InvalidArgumentError("cycle in the original graph");
+ }
+ clusters_.resize(graph.NodeLimit());
+ graph.PerNode([&](Node node) {
+ NodeList* n = new NodeList;
+ n->push_back(node);
+ clusters_[node] = n;
+ });
+ return absl::OkStatus();
+}
+
+Replace the deadlock detection system built into a mutex implementation with a
+better algorithm.
+Replaced the deadlock detection algorithm by one that is ~50x as fast and
+scales to millions of mutexes without problem (the old algorithm relied on a 2K
+limit to avoid a performance cliff). The new code is based on the following
+paper: David J. Pearce and Paul H. J. Kelly, "A Dynamic Topological Sort
+Algorithm for Directed Acyclic Graphs", Journal of Experimental Algorithmics
+(JEA), Volume 11, 2006, Article No. 1.7.
+The new algorithm takes O(|V|+|E|) space (instead of the O(|V|^2) bits needed
+by the older algorithm). Lock-acquisition order graphs are very sparse, so this
+is much less space. The algorithm is also quite simple: the core of it is ~100
+lines of C++. Since the code now scales to a much larger number of mutexes, we
+were able to relax an artificial 2K limit, which uncovered a number of latent
+deadlocks in real programs.
+Benchmark results: these were run in DEBUG mode since deadlock detection is
+mainly enabled in debug mode. The benchmark argument (/2k etc.) is the number
+of tracked nodes. At the default 2k limit of the old algorithm, the new
+algorithm takes only 0.5 microseconds per InsertEdge compared to 22
+microseconds for the old algorithm. The new algorithm also easily scales to
+much larger graphs without problems whereas the old algorithm keels over
+quickly.
+DEBUG: Benchmark Time(ns) CPU(ns) Iterations
+----------------------------------------------------------
+DEBUG: BM_StressTest/2k 23553 23566 29086
+DEBUG: BM_StressTest/4k 45879 45909 15287
+DEBUG: BM_StressTest/16k 776938 777472 817
+
+DEBUG: BM_StressTest/2k 392 393 10485760
+DEBUG: BM_StressTest/4k 392 393 10485760
+DEBUG: BM_StressTest/32k 407 407 10485760
+DEBUG: BM_StressTest/256k 456 456 10485760
+DEBUG: BM_StressTest/1M 534 534 10485760
+
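+A minimal sketch of the underlying idea (hypothetical class and method names,
+not the actual Mutex code): each mutex is a node, an edge A->B records "B was
+acquired while holding A", and an edge insertion that cannot be topologically
+ordered indicates a potential deadlock. Only nodes whose ranks fall between the
+two endpoints ever need to be re-ranked, which is what keeps insertions cheap
+on sparse graphs:
+
+#include <algorithm>
+#include <cstddef>
+#include <unordered_set>
+#include <vector>
+
+class LockOrderGraph {
+ public:
+  int AddNode() {
+    const int id = static_cast<int>(out_.size());
+    out_.emplace_back();
+    rank_.push_back(id);  // Any permutation is a valid initial order.
+    return id;
+  }
+
+  // Returns false (and leaves the graph unchanged) if edge x->y would close a
+  // cycle, i.e. indicates a potential deadlock.
+  bool InsertEdge(int x, int y) {
+    if (x == y) return false;
+    if (!out_[x].insert(y).second) return true;  // Edge already present.
+    if (rank_[x] < rank_[y]) return true;        // Order already consistent.
+    // Only nodes with rank in [rank_[y], rank_[x]] can be affected.
+    std::vector<int> fwd, bwd;
+    if (!ForwardDfs(y, rank_[x], &fwd)) {        // Reached x: cycle.
+      marked_.clear();
+      out_[x].erase(y);
+      return false;
+    }
+    BackwardDfs(x, rank_[y], &bwd);
+    Reorder(bwd, fwd);
+    return true;
+  }
+
+ private:
+  // Visits nodes reachable from n with rank <= upper; returns false if the
+  // node whose rank equals upper (i.e. x) is reached.
+  bool ForwardDfs(int n, int upper, std::vector<int>* visited) {
+    if (rank_[n] == upper) return false;
+    marked_.insert(n);
+    visited->push_back(n);
+    for (int m : out_[n]) {
+      if (rank_[m] <= upper && !marked_.count(m)) {
+        if (!ForwardDfs(m, upper, visited)) return false;
+      }
+    }
+    return true;
+  }
+
+  // Visits nodes that reach n and have rank >= lower. (A real implementation
+  // keeps in-edge lists; the linear scan keeps this sketch short.)
+  void BackwardDfs(int n, int lower, std::vector<int>* visited) {
+    marked_.insert(n);
+    visited->push_back(n);
+    for (int p = 0; p < static_cast<int>(out_.size()); ++p) {
+      if (rank_[p] >= lower && !marked_.count(p) && out_[p].count(n)) {
+        BackwardDfs(p, lower, visited);
+      }
+    }
+  }
+
+  // Reuses the ranks of the affected nodes so that everything that reaches x
+  // ends up ordered before everything reachable from y.
+  void Reorder(std::vector<int> bwd, std::vector<int> fwd) {
+    const auto by_rank = [&](int a, int b) { return rank_[a] < rank_[b]; };
+    std::sort(bwd.begin(), bwd.end(), by_rank);
+    std::sort(fwd.begin(), fwd.end(), by_rank);
+    std::vector<int> nodes = bwd;
+    nodes.insert(nodes.end(), fwd.begin(), fwd.end());
+    std::vector<int> ranks;
+    ranks.reserve(nodes.size());
+    for (int n : nodes) ranks.push_back(rank_[n]);
+    std::sort(ranks.begin(), ranks.end());
+    for (size_t i = 0; i < nodes.size(); ++i) rank_[nodes[i]] = ranks[i];
+    marked_.clear();
+  }
+
+  std::vector<std::unordered_set<int>> out_;  // Out-edge adjacency sets.
+  std::vector<int> rank_;                     // Current topological rank.
+  std::unordered_set<int> marked_;            // DFS visit marks.
+};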
+
+Replace an IntervalMap (with O(lg N) lookups) with a hash table (O(1) lookups).
+The initial code was using IntervalMap because it seemed like the right data
+structure to support coalescing of adjacent blocks, but a hash table suffices
+since the adjacent block can be found by a hash table lookup. This (plus other
+changes in the CL) improves the performance of tpu::BestFitAllocator by ~4X.
+best_fit_allocator.h
+using Block = gtl::IntervalMap<int64, BlockState>::Entry;
+...
+// Map of pairs (address range, BlockState) with one entry for each allocation
+// covering the range [0, allocatable_range_end_). Adjacent kFree and
+// kReserved blocks are coalesced. Adjacent kAllocated blocks are not
+// coalesced.
+gtl::IntervalMap<int64, BlockState> block_list_;
+
+// Set of all free blocks sorted according to the allocation policy. Adjacent
+// free blocks are coalesced.
+std::set<Block, BlockSelector> free_list_;
+
+// A faster hash function for offsets in the BlockTable
+struct OffsetHash {
+ ABSL_ATTRIBUTE_ALWAYS_INLINE size_t operator()(int64 value) const {
+ uint64 m = value;
+ m *= uint64_t{0x9ddfea08eb382d69};
+ return static_cast<uint64_t>(m ^ (m >> 32));
+ }
+};
+
+// Hash table maps from block start address to block info.
+// We include the length of the previous block in this info so we
+// can find the preceding block to coalesce with.
+struct HashTableEntry {
+ BlockState state;
+ int64 my_length;
+ int64 prev_length; // Zero if there is no previous block.
+};
+using BlockTable = absl::flat_hash_map<int64, HashTableEntry, OffsetHash>;
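+
+A sketch of how coalescing can work with this layout (a hypothetical helper,
+not the actual allocator code): both neighbors of a block are found with O(1)
+hash lookups, at offset + my_length and offset - prev_length, instead of
+ordered-map searches. A complete implementation must also fix up the
+prev_length of the block that follows the merged region.
+
+void CoalesceFreeBlock(BlockTable* blocks, int64 offset) {
+  auto it = blocks->find(offset);
+  if (it == blocks->end() || it->second.state != BlockState::kFree) return;
+
+  // Absorb the following block if it is also free.
+  auto next = blocks->find(offset + it->second.my_length);
+  if (next != blocks->end() && next->second.state == BlockState::kFree) {
+    it->second.my_length += next->second.my_length;
+    blocks->erase(next);
+  }
+  // Fold this block into a free predecessor, if any.
+  if (it->second.prev_length > 0) {
+    auto prev = blocks->find(offset - it->second.prev_length);
+    if (prev != blocks->end() && prev->second.state == BlockState::kFree) {
+      prev->second.my_length += it->second.my_length;
+      blocks->erase(it);
+    }
+  }
+}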
+
+Replace sorted-list intersection (O(N log N)) with hash table lookups (O(N)).
+Old code to detect whether or not two nodes share a common source would get the
+sources for each node in sorted order and then do a sorted intersection. The
+new code places the sources for one node in a hash-table and then iterates over
+the other node's sources checking the hash-table.
+name old time/op new time/op delta
+BM_CompileLarge 28.5s ± 2% 22.4s ± 2% -21.61% (p=0.008 n=5+5)
+
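+A minimal sketch of the new approach (hypothetical node type and function
+name):
+
+#include "absl/container/flat_hash_set.h"
+#include "absl/types/span.h"
+
+struct Node;  // Sources are compared by identity only.
+
+bool ShareCommonSource(absl::Span<const Node* const> a_sources,
+                       absl::Span<const Node* const> b_sources) {
+  // O(|a|) to build the set and expected O(|b|) to probe it; no sorting.
+  absl::flat_hash_set<const Node*> sources(a_sources.begin(), a_sources.end());
+  for (const Node* s : b_sources) {
+    if (sources.contains(s)) return true;
+  }
+  return false;
+}
+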
+
+Implement a good hash function so that hash-table operations are O(1) instead
+of O(N).
+location.h
+// Hasher for Location objects.
+struct LocationHash {
+ size_t operator()(const Location* key) const {
+ return key != nullptr ? util_hash::Hash(key->address()) : 0;
+ }
+};
+
+size_t HashLocation(const Location& loc);
+...
+struct LocationHash {
+ size_t operator()(const Location* key) const {
+ return key != nullptr ? HashLocation(*key) : 0;
+ }
+};
+
+location.cc
+size_t HashLocation(const Location& loc) {
+ util_hash::MurmurCat m;
+
+ // Encode some simpler features into a single value.
+ m.AppendAligned((loc.dynamic() ? 1 : 0) //
+ | (loc.append_shard_to_address() ? 2 : 0) //
+ | (loc.is_any() ? 4 : 0) //
+ | (!loc.any_of().empty() ? 8 : 0) //
+ | (loc.has_shardmap() ? 16 : 0) //
+ | (loc.has_sharding() ? 32 : 0));
+
+ if (loc.has_shardmap()) {
+ m.AppendAligned(loc.shardmap().output() |
+ static_cast<uint64_t>(loc.shardmap().stmt()) << 20);
+ }
+ if (loc.has_sharding()) {
+ uint64_t num = 0;
+ switch (loc.sharding().type_case()) {
+ case Sharding::kModShard:
+ num = loc.sharding().mod_shard();
+ break;
+ case Sharding::kRangeSplit:
+ num = loc.sharding().range_split();
+ break;
+ case Sharding::kNumShards:
+ num = loc.sharding().num_shards();
+ break;
+ default:
+ num = 0;
+ break;
+ }
+ m.AppendAligned(static_cast<uint64_t>(loc.sharding().type_case()) |
+ (num << 3));
+ }
+
+ auto add_string = [&m](absl::string_view s) {
+ if (!s.empty()) {
+ m.Append(s.data(), s.size());
+ }
+ };
+
+ add_string(loc.address());
+ add_string(loc.lb_policy());
+
+ // We do not include any_of since it is complicated to compute a hash
+ // value that is not sensitive to order and duplication.
+ return m.GetHash();
+}
+
+Reduce allocations and improve cache footprint by converting
+btree<a,btree<b,c>> to btree<pair<a,b>,c>.
+graph_splitter.cc
+absl::btree_map<std::string, absl::btree_map<std::string, OpDef>> ops;
+
+// The btree maps from {package_name, op_name} to its const Opdef*.
+absl::btree_map<std::pair<absl::string_view, absl::string_view>,
+ const OpDef*>
+ ops;
+
+Switching to a nested map leads to a 76% performance improvement in a
+microbenchmark.
+We previously had a single-level hash table where the key consisted of a
+(string) path and some other numeric sub-keys. Each path occurred in
+approximately 1000 keys on average. We split the hash table into two levels
+where the first level was keyed by the path and each second-level hash table
+kept just the sub-key to data mapping for a particular path. This reduced the
+memory usage for storing paths by a factor of 1000, and also sped up accesses
+where many sub-keys for the same path were accessed together.
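+A minimal sketch of the layout change described above (hypothetical key and
+value types):
+
+#include <cstdint>
+#include <string>
+#include <utility>
+#include "absl/container/flat_hash_map.h"
+
+struct Stats { int64_t count = 0; };
+
+// Before: one flat table; every entry stores its own copy of the path string.
+using FlatTable =
+    absl::flat_hash_map<std::pair<std::string, uint64_t>, Stats>;
+
+// After: the outer table stores each path once; the inner table maps the
+// numeric sub-key to the data for that path.
+using NestedTable =
+    absl::flat_hash_map<std::string, absl::flat_hash_map<uint64_t, Stats>>;
+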
+Use an array instead of flat_map.
+rtp_controller.h
+const gtl::flat_map<int, int> payload_type_to_clock_frequency_;
+
+// A map (implemented as a simple array) indexed by payload_type to clock freq
+// for that payload type (or 0)
+struct PayloadTypeToClockRateMap {
+ int map[128];
+};
+...
+const PayloadTypeToClockRateMap payload_type_to_clock_frequency_;
+
+Spanner placement system. Replace dense_hash_set<ZoneId> with a bit-vector with
+one bit per zone.
+zone_set.h
+class ZoneSet: public dense_hash_set<ZoneId> {
+ public:
+ ...
+ bool Contains(ZoneId zone) const {
+ return count(zone) > 0;
+ }
+
+class ZoneSet {
+ ...
+ // Returns true iff "zone" is contained in the set
+ bool ContainsZone(ZoneId zone) const {
+ return zone < b_.size() && b_.get_bit(zone);
+ }
+ ...
+ private:
+ int size_; // Number of zones inserted
+ util::bitmap::InlinedBitVector<256> b_;
+
+Benchmark results:
+CPU: AMD Opteron (4 cores) dL1:64KB dL2:1024KB
+Benchmark Base (ns) New (ns) Improvement
+------------------------------------------------------------------
+BM_Evaluate/1 960 676 +29.6%
+BM_Evaluate/2 1661 1138 +31.5%
+BM_Evaluate/3 2305 1640 +28.9%
+BM_Evaluate/4 3053 2135 +30.1%
+BM_Evaluate/5 3780 2665 +29.5%
+BM_Evaluate/10 7819 5739 +26.6%
+BM_Evaluate/20 17922 12338 +31.2%
+BM_Evaluate/40 36836 26430 +28.2%
+
+
+Use a bit matrix to keep track of reachability properties between operands
+instead of a hash table.
+hlo_computation.h
+using TransitiveOperandMap =
+ std::unordered_map<const HloInstruction*,
+ std::unordered_set<const HloInstruction*>>;
+
+class HloComputation::ReachabilityMap {
+ ...
+ // dense id assignment from HloInstruction* to number
+ tensorflow::gtl::FlatMap<const HloInstruction*, int> ids_;
+ // matrix_(a,b) is true iff b is reachable from a
+ tensorflow::core::Bitmap matrix_;
+};
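+
+The same idea in a self-contained form (std containers rather than the
+TensorFlow types above): give each instruction a dense id and pack the
+reachability relation into one bit per (from, to) pair.
+
+#include <cstddef>
+#include <vector>
+
+class ReachabilityMatrix {
+ public:
+  explicit ReachabilityMatrix(int n)
+      : n_(n), bits_(static_cast<size_t>(n) * n) {}
+  void SetReachable(int from, int to) { bits_[Index(from, to)] = true; }
+  bool IsReachable(int from, int to) const { return bits_[Index(from, to)]; }
+
+ private:
+  size_t Index(int from, int to) const {
+    return static_cast<size_t>(from) * n_ + to;
+  }
+  int n_;
+  std::vector<bool> bits_;  // n*n entries, bit-packed by std::vector<bool>.
+};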
+
+Reducing allocations increases benchmark throughput by 21%.
+memory_manager.cc
+LiveTensor::LiveTensor(tf::Tensor t, std::shared_ptr<const DeviceInfo> dinfo,
+ bool is_batched)
+ : tensor(std::move(t)),
+ device_info(dinfo ? std::move(dinfo) : std::make_shared<DeviceInfo>()),
+ is_batched(is_batched) {
+
+static const std::shared_ptr<DeviceInfo>& empty_device_info() {
+ static std::shared_ptr<DeviceInfo>* result =
+ new std::shared_ptr<DeviceInfo>(new DeviceInfo);
+ return *result;
+}
+
+LiveTensor::LiveTensor(tf::Tensor t, std::shared_ptr<const DeviceInfo> dinfo,
+ bool is_batched)
+ : tensor(std::move(t)), is_batched(is_batched) {
+ if (dinfo) {
+ device_info = std::move(dinfo);
+ } else {
+ device_info = empty_device_info();
+ }
+
+Use a statically-allocated zero vector when possible rather than allocating a
+vector and filling it with zeroes.
+embedding_executor_8bit.cc
+// The actual implementation of the EmbeddingLookUpT using template parameters
+// instead of object members to improve the performance.
+template <bool Mean, bool SymmetricInputRange>
+static tensorflow::Status EmbeddingLookUpT(...) {
+ ...
+ std::unique_ptr<tensorflow::quint8[]> zero_data(
+ new tensorflow::quint8[max_embedding_width]);
+ memset(zero_data.get(), 0, sizeof(tensorflow::quint8) * max_embedding_width);
+
+// A size large enough to handle most embedding widths
+static const int kTypicalMaxEmbedding = 256;
+static tensorflow::quint8 static_zero_data[kTypicalMaxEmbedding]; // All zeroes
+...
+// The actual implementation of the EmbeddingLookUpT using template parameters
+// instead of object members to improve the performance.
+template <bool Mean, bool SymmetricInputRange>
+static tensorflow::Status EmbeddingLookUpT(...) {
+ ...
+ std::unique_ptr<tensorflow::quint8[]> zero_data_backing(nullptr);
+
+ // Get a pointer to a memory area with at least
+ // "max_embedding_width" quint8 zero values.
+ tensorflow::quint8* zero_data;
+ if (max_embedding_width <= ARRAYSIZE(static_zero_data)) {
+ // static_zero_data is big enough so we don't need to allocate zero data
+ zero_data = &static_zero_data[0];
+ } else {
+ // static_zero_data is not big enough: we need to allocate zero data
+ zero_data_backing =
+ absl::make_unique<tensorflow::quint8[]>(max_embedding_width);
+ memset(zero_data_backing.get(), 0,
+ sizeof(tensorflow::quint8) * max_embedding_width);
+ zero_data = zero_data_backing.get();
+ }
+
+
+Pre-size a vector and fill it in, rather than N push_back operations.
+indexblockdecoder.cc
+for (int i = 0; i < ndocs-1; i++) {
+ uint32 delta;
+ ERRORCHECK(b->GetRice(rice_base, &delta));
+ docs_.push_back(DocId(my_shard_ + (base + delta) * num_shards_));
+ base = base + delta + 1;
+}
+docs_.push_back(last_docid_);
+
+docs_.resize(ndocs);
+DocId* docptr = &docs_[0];
+for (int i = 0; i < ndocs-1; i++) {
+ uint32 delta;
+ ERRORCHECK(b.GetRice(rice_base, &delta));
+ *docptr = DocId(my_shard_ + (base + delta) * num_shards_);
+ docptr++;
+ base = base + delta + 1;
+}
+*docptr = last_docid_;
+
+Avoid an extra copy when receiving a tensor via gRPC.
+A benchmark that sends around 400KB tensors speeds up by ~10-15%:
+Benchmark Time(ns) CPU(ns) Iterations
+-----------------------------------------------------
+BM_RPC/30/98k_mean 148764691 1369998944 1000
+
+Benchmark Time(ns) CPU(ns) Iterations
+-----------------------------------------------------
+BM_RPC/30/98k_mean 131595940 1216998084 1000
+
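+A purely illustrative sketch of the kind of change involved (hypothetical
+types, not the actual TensorFlow/gRPC code): copy each incoming wire slice
+directly into the tensor's buffer instead of assembling a temporary string
+first.
+
+#include <cstddef>
+#include <cstring>
+#include <string>
+#include <vector>
+
+struct WireSlice {   // Stand-in for a received data fragment.
+  const char* data;
+  size_t size;
+};
+
+// Before (two copies): slices -> temporary string -> tensor buffer.
+void ReceiveWithExtraCopy(const std::vector<WireSlice>& slices,
+                          char* tensor_buf) {
+  std::string tmp;
+  for (const auto& s : slices) tmp.append(s.data, s.size);
+  std::memcpy(tensor_buf, tmp.data(), tmp.size());
+}
+
+// After (one copy): slices -> tensor buffer.
+void ReceiveDirect(const std::vector<WireSlice>& slices, char* tensor_buf) {
+  size_t offset = 0;
+  for (const auto& s : slices) {
+    std::memcpy(tensor_buf + offset, s.data, s.size);
+    offset += s.size;
+  }
+}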
+
+Move large options structure rather than copying it.
+index.cc
+return search_iterators::DocPLIteratorFactory::Create(opts);
+
+return search_iterators::DocPLIteratorFactory::Create(std::move(opts));
+
+Use std::sort instead of std::stable_sort, which avoids an internal copy inside
+the stable sort implementation.
+encoded-vector-hits.h
+std::stable_sort(hits_.begin(), hits_.end(),
+ gtl::OrderByField(&HitWithPayloadOffset::docid));
+
+struct HitWithPayloadOffset {
+ search_iterators::LocalDocId64 docid;
+ int first_payload_offset; // offset into the payload vector.
+ int num_payloads;
+
+ bool operator<(const HitWithPayloadOffset& other) const {
+ return (docid < other.docid) ||
+ (docid == other.docid &&
+ first_payload_offset < other.first_payload_offset);
+ }
+};
+ ...
+ std::sort(hits_.begin(), hits_.end());
+
+Hoist variable definition outside of loop iteration.
+autofdo_profile_utils.h
+auto iterator = absl::WrapUnique(sstable->GetIterator());
+while (!iterator->done()) {
+ T profile;
+ if (!profile.ParseFromString(iterator->value_view())) {
+ return absl::InternalError(
+ "Failed to parse mem_block to specified profile type.");
+ }
+ ...
+ iterator->Next();
+}
+
+auto iterator = absl::WrapUnique(sstable->GetIterator());
+T profile;
+while (!iterator->done()) {
+ if (!profile.ParseFromString(iterator->value_view())) {
+ return absl::InternalError(
+ "Failed to parse mem_block to specified profile type.");
+ }
+ ...
+ iterator->Next();
+}
+
+Define a protobuf variable outside a loop so that its allocated storage can be
+reused across loop iterations.
+stats-router.cc
+for (auto& r : routers_to_update) {
+ ...
+ ResourceRecord record;
+ {
+ MutexLock agg_lock(r.agg->mutex());
+ r.agg->AddResourceRecordUsages(measure_indices, &record);
+ }
+ ...
+}
+
+ResourceRecord record;
+for (auto& r : routers_to_update) {
+ ...
+ record.Clear();
+ {
+ MutexLock agg_lock(r.agg->mutex());
+ r.agg->AddResourceRecordUsages(measure_indices, &record);
+ }
+ ...
+}
+
+Serialize to the same std::string repeatedly.
+program_rep.cc
+std::string DeterministicSerialization(const proto2::Message& m) {
+ std::string result;
+ proto2::io::StringOutputStream sink(&result);
+ proto2::io::CodedOutputStream out(&sink);
+ out.SetSerializationDeterministic(true);
+ m.SerializePartialToCodedStream(&out);
+ return result;
+}
+
+absl::string_view DeterministicSerializationTo(const proto2::Message& m,
+ std::string* scratch) {
+ scratch->clear();
+ proto2::io::StringOutputStream sink(scratch);
+ proto2::io::CodedOutputStream out(&sink);
+ out.SetSerializationDeterministic(true);
+ m.SerializePartialToCodedStream(&out);
+ return absl::string_view(*scratch);
+}
+
+Make fast path cover more common cases.
+Add handling of trailing single ASCII bytes, rather than only handling
+multiples of four bytes with this routine. This avoids calling the slower
+generic routine for all-ASCII strings that are, for example, 5 bytes.
+utf8statetable.cc
+// Scan a UTF-8 stringpiece based on state table.
+// Always scan complete UTF-8 characters
+// Set number of bytes scanned. Return reason for exiting
+// OPTIMIZED for case of 7-bit ASCII 0000..007f all valid
+int UTF8GenericScanFastAscii(const UTF8ScanObj* st, absl::string_view str,
+ int* bytes_consumed) {
+ ...
+ int exit_reason;
+ do {
+ // Skip 8 bytes of ASCII at a whack; no endianness issue
+ while ((src_limit - src >= 8) &&
+ (((UNALIGNED_LOAD32(src + 0) | UNALIGNED_LOAD32(src + 4)) &
+ 0x80808080) == 0)) {
+ src += 8;
+ }
+ // Run state table on the rest
+ int rest_consumed;
+ exit_reason = UTF8GenericScan(
+ st, absl::ClippedSubstr(str, src - initial_src), &rest_consumed);
+ src += rest_consumed;
+ } while (exit_reason == kExitDoAgain);
+
+ *bytes_consumed = src - initial_src;
+ return exit_reason;
+}
+
+// Scan a UTF-8 stringpiece based on state table.
+// Always scan complete UTF-8 characters
+// Set number of bytes scanned. Return reason for exiting
+// OPTIMIZED for case of 7-bit ASCII 0000..007f all valid
+int UTF8GenericScanFastAscii(const UTF8ScanObj* st, absl::string_view str,
+ int* bytes_consumed) {
+ ...
+ int exit_reason = kExitOK;
+ do {
+ // Skip 8 bytes of ASCII at a whack; no endianness issue
+ while ((src_limit - src >= 8) &&
+ (((UNALIGNED_LOAD32(src + 0) | UNALIGNED_LOAD32(src + 4)) &
+ 0x80808080) == 0)) {
+ src += 8;
+ }
+ while (src < src_limit && Is7BitAscii(*src)) { // Skip ASCII bytes
+ src++;
+ }
+ if (src < src_limit) {
+ // Run state table on the rest
+ int rest_consumed;
+ exit_reason = UTF8GenericScan(
+ st, absl::ClippedSubstr(str, src - initial_src), &rest_consumed);
+ src += rest_consumed;
+ }
+ } while (exit_reason == kExitDoAgain);
+
+ *bytes_consumed = src - initial_src;
+ return exit_reason;
+}
+
+Simpler fast paths for InlinedVector.
+inlined_vector.h
+auto Storage<T, N, A>::Resize(ValueAdapter values, size_type new_size) -> void {
+ StorageView storage_view = MakeStorageView();
+
+ IteratorValueAdapter<MoveIterator> move_values(
+ MoveIterator(storage_view.data));
+
+ AllocationTransaction allocation_tx(GetAllocPtr());
+ ConstructionTransaction construction_tx(GetAllocPtr());
+
+ absl::Span<value_type> construct_loop;
+ absl::Span<value_type> move_construct_loop;
+ absl::Span<value_type> destroy_loop;
+
+ if (new_size > storage_view.capacity) {
+ ...
+ } else if (new_size > storage_view.size) {
+ construct_loop = {storage_view.data + storage_view.size,
+ new_size - storage_view.size};
+ } else {
+ destroy_loop = {storage_view.data + new_size, storage_view.size - new_size};
+ }
+
+auto Storage<T, N, A>::Resize(ValueAdapter values, size_type new_size) -> void {
+ StorageView storage_view = MakeStorageView();
+ auto* const base = storage_view.data;
+ const size_type size = storage_view.size;
+ auto* alloc = GetAllocPtr();
+ if (new_size <= size) {
+ // Destroy extra old elements.
+ inlined_vector_internal::DestroyElements(alloc, base + new_size,
+ size - new_size);
+ } else if (new_size <= storage_view.capacity) {
+ // Construct new elements in place.
+ inlined_vector_internal::ConstructElements(alloc, base + size, &values,
+ new_size - size);
+ } else {
+ ...
+ }
+
+Fast path for common cases of initializing 1-D to 4-D tensors.
+tensor_shape.cc
+template <class Shape>
+TensorShapeBase<Shape>::TensorShapeBase(gtl::ArraySlice<int64> dim_sizes) {
+ set_tag(REP16);
+ set_data_type(DT_INVALID);
+ set_ndims_byte(0);
+ set_num_elements(1);
+ for (int64 s : dim_sizes) {
+ AddDim(internal::SubtleMustCopy(s));
+ }
+}
+
+template <class Shape>
+void TensorShapeBase<Shape>::InitDims(gtl::ArraySlice<int64> dim_sizes) {
+ DCHECK_EQ(tag(), REP16);
+
+ // Allow sizes that are under kint64max^0.25 so that 4-way multiplication
+ // below cannot overflow.
+ static const uint64 kMaxSmall = 0xd744;
+ static_assert(kMaxSmall * kMaxSmall * kMaxSmall * kMaxSmall <= kint64max,
+ "bad overflow check");
+ bool large_size = false;
+ for (auto s : dim_sizes) {
+ if (s > kMaxSmall) {
+ large_size = true;
+ break;
+ }
+ }
+
+ if (!large_size) {
+ // Every size fits in 16 bits; use fast-paths for dims in {1,2,3,4}.
+ uint16* dst = as16()->dims_;
+ switch (dim_sizes.size()) {
+ case 1: {
+ set_ndims_byte(1);
+ const int64 size = dim_sizes[0];
+ const bool neg = Set16(kIsPartial, dst, 0, size);
+ set_num_elements(neg ? -1 : size);
+ return;
+ }
+ case 2: {
+ set_ndims_byte(2);
+ const int64 size0 = dim_sizes[0];
+ const int64 size1 = dim_sizes[1];
+ bool neg = Set16(kIsPartial, dst, 0, size0);
+ neg |= Set16(kIsPartial, dst, 1, size1);
+ set_num_elements(neg ? -1 : (size0 * size1));
+ return;
+ }
+ case 3: {
+ ...
+ }
+ case 4: {
+ ...
+ }
+ }
+ }
+
+ set_ndims_byte(0);
+ set_num_elements(1);
+ for (int64 s : dim_sizes) {
+ AddDim(internal::SubtleMustCopy(s));
+ }
+}
+
+Make varint parser fast path cover just the 1-byte case, instead of covering
+1-byte and 2-byte cases.
+Reducing the size of the (inlined) fast path reduces code size and icache
+pressure, which leads to improved performance.
+parse_context.h
+template <typename T>
+PROTOBUF_NODISCARD const char* VarintParse(const char* p, T* out) {
+ auto ptr = reinterpret_cast<const uint8_t*>(p);
+ uint32_t res = ptr[0];
+ if (!(res & 0x80)) {
+ *out = res;
+ return p + 1;
+ }
+ uint32_t byte = ptr[1];
+ res += (byte - 1) << 7;
+ if (!(byte & 0x80)) {
+ *out = res;
+ return p + 2;
+ }
+ return VarintParseSlow(p, res, out);
+}
+
+template <typename T>
+PROTOBUF_NODISCARD const char* VarintParse(const char* p, T* out) {
+ auto ptr = reinterpret_cast<const uint8_t*>(p);
+ uint32_t res = ptr[0];
+ if (!(res & 0x80)) {
+ *out = res;
+ return p + 1;
+ }
+ return VarintParseSlow(p, res, out);
+}
+
+parse_context.cc
+std::pair<const char*, uint32_t> VarintParseSlow32(const char* p,
+ uint32_t res) {
+ for (std::uint32_t i = 2; i < 5; i++) {
+ ...
+}
+...
+std::pair<const char*, uint64_t> VarintParseSlow64(const char* p,
+ uint32_t res32) {
+ uint64_t res = res32;
+ for (std::uint32_t i = 2; i < 10; i++) {
+ ...
+}
+
+std::pair<const char*, uint32_t> VarintParseSlow32(const char* p,
+ uint32_t res) {
+ for (std::uint32_t i = 1; i < 5; i++) {
+ ...
+}
+...
+std::pair<const char*, uint64_t> VarintParseSlow64(const char* p,
+ uint32_t res32) {
+ uint64_t res = res32;
+ for (std::uint32_t i = 1; i < 10; i++) {
+ ...
+}
+
+Skip significant work in RPC_Stats_Measurement addition if no errors have
+occurred.
+rpc-stats.h
+struct RPC_Stats_Measurement {
+ ...
+ double errors[RPC::NUM_ERRORS];
+
+struct RPC_Stats_Measurement {
+ ...
+ double get_errors(int index) const { return errors[index]; }
+ void set_errors(int index, double value) {
+ errors[index] = value;
+ any_errors_set = true;
+ }
+ private:
+ ...
+ // We make this private so that we can keep track of whether any of
+ // these values have been set to non-zero values.
+ double errors[RPC::NUM_ERRORS];
+ bool any_errors_set; // True iff any of the errors[i] values are non-zero
+
+rpc-stats.cc
+void RPC_Stats_Measurement::operator+=(const RPC_Stats_Measurement& x) {
+ ...
+ for (int i = 0; i < RPC::NUM_ERRORS; ++i) {
+ errors[i] += x.errors[i];
+ }
+}
+
+void RPC_Stats_Measurement::operator+=(const RPC_Stats_Measurement& x) {
+ ...
+ if (x.any_errors_set) {
+ for (int i = 0; i < RPC::NUM_ERRORS; ++i) {
+ errors[i] += x.errors[i];
+ }
+ any_errors_set = true;
+ }
+}
+
+Do an array lookup on the first byte of a string to often avoid fingerprinting
+the full string.
+soft-tokens-helper.cc
+bool SoftTokensHelper::IsSoftToken(const StringPiece& token) const {
+ return soft_tokens_.find(Fingerprint(token.data(), token.size())) !=
+ soft_tokens_.end();
+}
+
+soft-tokens-helper.h
+class SoftTokensHelper {
+ ...
+ private:
+ ...
+ // Since soft tokens are mostly punctuation-related, for performance
+ // purposes, we keep an array filter_. filter_[i] is true iff any
+ // of the soft tokens start with the byte value 'i'. This avoids
+ // fingerprinting a term in the common case, since we can just do an array
+ // lookup based on the first byte, and if filter_[b] is false, then
+ // we can return false immediately.
+ bool filter_[256];
+ ...
+};
+
+inline bool SoftTokensHelper::IsSoftToken(const StringPiece& token) const {
+ if (token.size() >= 1) {
+ char first_char = token.data()[0];
+ if (!filter_[first_char]) {
+ return false;
+ }
+ }
+ return IsSoftTokenFallback(token);
+}
+
+soft-tokens-helper.cc
+bool SoftTokensHelper::IsSoftTokenFallback(const StringPiece& token) const {
+ return soft_tokens_.find(Fingerprint(token.data(), token.size())) !=
+ soft_tokens_.end();
+}
+
+Precompute a TensorFlow graph execution node property that allows us to quickly
+rule out certain unusual cases.
+executor.cc
+struct NodeItem {
+ ...
+ bool kernel_is_expensive = false; // True iff kernel->IsExpensive()
+ bool kernel_is_async = false; // True iff kernel->AsAsync() != nullptr
+ bool is_merge = false; // True iff IsMerge(node)
+ ...
+ if (IsEnter(node)) {
+ ...
+ } else if (IsExit(node)) {
+ ...
+ } else if (IsNextIteration(node)) {
+ ...
+ } else {
+ // Normal path for most nodes
+ ...
+ }
+
+struct NodeItem {
+ ...
+ bool kernel_is_expensive : 1; // True iff kernel->IsExpensive()
+ bool kernel_is_async : 1; // True iff kernel->AsAsync() != nullptr
+ bool is_merge : 1; // True iff IsMerge(node)
+ bool is_enter : 1; // True iff IsEnter(node)
+ bool is_exit : 1; // True iff IsExit(node)
+ bool is_control_trigger : 1; // True iff IsControlTrigger(node)
+ bool is_sink : 1; // True iff IsSink(node)
+ // True iff IsEnter(node) || IsExit(node) || IsNextIteration(node)
+ bool is_enter_exit_or_next_iter : 1;
+ ...
+ if (!item->is_enter_exit_or_next_iter) {
+ // Fast path for nodes types that don't need special handling
+ DCHECK_EQ(input_frame, output_frame);
+ ...
+ } else if (item->is_enter) {
+ ...
+ } else if (item->is_exit) {
+ ...
+ } else {
+ DCHECK(IsNextIteration(node));
+ ...
+ }
+
+Precompute a 256-element array and use it during trigram initialization.
+byte_trigram_classifier.cc
+void ByteTrigramClassifier::VerifyModel(void) const {
+ ProbT class_sums[num_classes_];
+ for (int cls = 0; cls < num_classes_; cls++) {
+ class_sums[cls] = 0;
+ }
+ for (ByteNgramId id = 0; id < trigrams_.num_trigrams(); id++) {
+ for (int cls = 0; cls < num_classes_; ++cls) {
+ class_sums[cls] += Prob(trigram_probs_[id].log_probs[cls]);
+ }
+ }
+ ...
+}
+
+void ByteTrigramClassifier::VerifyModel(void) const {
+ CHECK_EQ(sizeof(ByteLogProbT), 1);
+ ProbT fast_prob[256];
+ for (int b = 0; b < 256; b++) {
+ fast_prob[b] = Prob(static_cast<ByteLogProbT>(b));
+ }
+
+ ProbT class_sums[num_classes_];
+ for (int cls = 0; cls < num_classes_; cls++) {
+ class_sums[cls] = 0;
+ }
+ for (ByteNgramId id = 0; id < trigrams_.num_trigrams(); id++) {
+ for (int cls = 0; cls < num_classes_; ++cls) {
+ class_sums[cls] += fast_prob[trigram_probs_[id].log_probs[cls]];
+ }
+ }
+ ...
+}
+
+Move bounds computation outside loop.
+literal_linearizer.cc
+for (int64 i = 0; i < src_shape.dimensions(dimension_numbers.front());
+ ++i) {
+
+int64 dim_front = src_shape.dimensions(dimension_numbers.front());
+const uint8* src_buffer_data = src_buffer.data();
+uint8* dst_buffer_data = dst_buffer.data();
+for (int64 i = 0; i < dim_front; ++i) {
+
+Defer GetSubSharding call until needed, which reduces 43 seconds of CPU time to
+2 seconds.
+sharding_propagation.cc
+HloSharding alternative_sub_sharding =
+ user.sharding().GetSubSharding(user.shape(), {i});
+if (user.operand(i) == &instruction &&
+ hlo_sharding_util::IsShardingMoreSpecific(alternative_sub_sharding,
+ sub_sharding)) {
+ sub_sharding = alternative_sub_sharding;
+}
+
+if (user.operand(i) == &instruction) {
+ // Only evaluate GetSubSharding if this operand is of interest,
+ // as it is relatively expensive.
+ HloSharding alternative_sub_sharding =
+ user.sharding().GetSubSharding(user.shape(), {i});
+ if (hlo_sharding_util::IsShardingMoreSpecific(
+ alternative_sub_sharding, sub_sharding)) {
+ sub_sharding = alternative_sub_sharding;
+ }
+}
+
+Don't update stats eagerly; compute them on demand.
+Do not update stats on the very frequent allocation/deallocation calls.
+Instead, compute stats on demand when the much less frequently called Stats()
+method is invoked.
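+A minimal sketch of the pattern (hypothetical allocator, not the actual code):
+the hot Allocate/Deallocate paths do no stats bookkeeping beyond what they
+already need, and the rarely called GetStats() derives the numbers on demand.
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <unordered_map>
+
+class TrackingAllocator {
+ public:
+  void* Allocate(size_t n) {
+    void* p = std::malloc(n);
+    live_[p] = n;  // Needed for Deallocate anyway; no extra stats work here.
+    return p;
+  }
+  void Deallocate(void* p) {
+    live_.erase(p);
+    std::free(p);
+  }
+
+  struct Stats {
+    int64_t num_live = 0;
+    int64_t bytes_in_use = 0;
+  };
+  // Called rarely (e.g. by monitoring), so an O(#live blocks) walk is fine.
+  Stats GetStats() const {
+    Stats s;
+    for (const auto& entry : live_) {
+      ++s.num_live;
+      s.bytes_in_use += static_cast<int64_t>(entry.second);
+    }
+    return s;
+  }
+
+ private:
+  std::unordered_map<void*, size_t> live_;
+};
+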
+Preallocate 10 nodes, not 200, for query handling in Google's web server.
+A simple change that reduced the web server's CPU usage by 7.5%.
+querytree.h
+static const int kInitParseTreeSize = 200; // initial size of querynode pool
+
+static const int kInitParseTreeSize = 10; // initial size of querynode pool
+
+Change search order for 19% throughput improvement.
+An old search system (circa 2000) had two tiers: one contained a full-text
+index, and the other tier contained just the index for the title and anchor
+terms. We used to search the smaller title/anchor tier first.
+Counter-intuitively, we found that it is cheaper to search the larger full-text
+index tier first since if we reach the end of the full-text tier, we can
+entirely skip searching the title/anchor tier (a subset of the full-text tier).
+This happened reasonably often and allowed us to reduce the average number of
+disk seeks to process a query.
+See discussion of title and anchor text handling in The Anatomy of a
+Large-Scale Hypertextual Web Search Engine for background information.
+Custom printing code for Histogram class is 4x as fast as sprintf.
+This code is performance sensitive because it is invoked when monitoring
+systems gather statistics from various servers.
+histogram_export.cc
+void Histogram::PopulateBuckets(const string &prefix,
+ expvar::MapProto *const var) const {
+ ...
+ for (int i = min_bucket; i <= max_bucket; ++i) {
+ const double count = BucketCount(i);
+ if (!export_empty_buckets && count == 0.0) continue;
+ acc += count;
+ // The label format of exported buckets for discrete histograms
+ // specifies an inclusive upper bound, which is the same as in
+ // the original Histogram implementation. This format is not
+ // applicable to non-discrete histograms, so a half-open interval
+ // is used for them, with "_" instead of "-" as a separator to
+ // make possible to distinguish the formats.
+ string key =
+ options_.export_cumulative_counts() ?
+ StringPrintf("%.12g", boundaries_->BucketLimit(i)) :
+ options_.discrete() ?
+ StringPrintf("%.0f-%.0f",
+ ceil(boundaries_->BucketStart(i)),
+ ceil(boundaries_->BucketLimit(i)) - 1.0) :
+ StringPrintf("%.12g_%.12g",
+ boundaries_->BucketStart(i),
+ boundaries_->BucketLimit(i));
+ EscapeMapKey(&key);
+ const double value = options_.export_cumulative_counts() ? acc : count;
+ expvar::AddMapFloat(StrCat(prefix,
+ options_.export_bucket_key_prefix(),
+ key),
+ value * count_mult,
+ var);
+ }
+
+// Format "val" according to format. If "need_escape" is true, then the
+// format can produce output with a '.' in it, and the result will be escaped.
+// If "need_escape" is false, then the caller guarantees that format is
+// such that the resulting number will not have any '.' characters and
+// therefore we can avoid calling EscapeKey.
+// The function is free to use "*scratch" for scratch space if necessary,
+// and the resulting StringPiece may point into "*scratch".
+static StringPiece FormatNumber(const char* format,
+ bool need_escape,
+ double val, string* scratch) {
+ // This routine is specialized to work with only a limited number of formats
+ DCHECK(StringPiece(format) == "%.0f" || StringPiece(format) == "%.12g");
+
+ scratch->clear();
+ if (val == trunc(val) && val >= kint32min && val <= kint32max) {
+ // An integer for which we can just use StrAppend
+ StrAppend(scratch, static_cast<int32>(val));
+ return StringPiece(*scratch);
+ } else if (isinf(val)) {
+ // Infinity, represent as just 'inf'.
+ return StringPiece("inf", 3);
+ } else {
+ // Format according to "format", and possibly escape.
+ StringAppendF(scratch, format, val);
+ if (need_escape) {
+ EscapeMapKey(scratch);
+ } else {
+ DCHECK(!StringPiece(*scratch).contains("."));
+ }
+ return StringPiece(*scratch);
+ }
+}
+...
+void Histogram::PopulateBuckets(const string &prefix,
+ expvar::MapProto *const var) const {
+ ...
+ const string full_key_prefix = StrCat(prefix,
+ options_.export_bucket_key_prefix());
+ string key = full_key_prefix; // Keys will start with "full_key_prefix".
+ string start_scratch;
+ string limit_scratch;
+ const bool cumul_counts = options_.export_cumulative_counts();
+ const bool discrete = options_.discrete();
+ for (int i = min_bucket; i <= max_bucket; ++i) {
+ const double count = BucketCount(i);
+ if (!export_empty_buckets && count == 0.0) continue;
+ acc += count;
+ // The label format of exported buckets for discrete histograms
+ // specifies an inclusive upper bound, which is the same as in
+ // the original Histogram implementation. This format is not
+ // applicable to non-discrete histograms, so a half-open interval
+ // is used for them, with "_" instead of "-" as a separator to
+ // make possible to distinguish the formats.
+ key.resize(full_key_prefix.size()); // Start with full_key_prefix.
+ DCHECK_EQ(key, full_key_prefix);
+
+ const double limit = boundaries_->BucketLimit(i);
+ if (cumul_counts) {
+ StrAppend(&key, FormatNumber("%.12g", true, limit, &limit_scratch));
+ } else {
+ const double start = boundaries_->BucketStart(i);
+ if (discrete) {
+ StrAppend(&key,
+ FormatNumber("%.0f", false, ceil(start), &start_scratch),
+ "-",
+ FormatNumber("%.0f", false, ceil(limit) - 1.0,
+ &limit_scratch));
+ } else {
+ StrAppend(&key,
+ FormatNumber("%.12g", true, start, &start_scratch),
+ "_",
+ FormatNumber("%.12g", true, limit, &limit_scratch));
+ }
+ }
+ const double value = cumul_counts ? acc : count;
+
+ // Add to map var
+ expvar::AddMapFloat(key, value * count_mult, var);
+ }
+}
+
+Add specializations for VLOG(1), VLOG(2), … for speed and smaller code size.
+VLOG is a heavily used macro throughout the code base. This change avoids
+passing an extra integer constant at nearly every call site (if the log level is
+constant at the call site, as it almost always is, as in VLOG(1) << ...),
+which saves code space.
+vlog_is_on.h
+class VLogSite final {
+ public:
+ ...
+ bool IsEnabled(int level) {
+ int stale_v = v_.load(std::memory_order_relaxed);
+ if (ABSL_PREDICT_TRUE(level > stale_v)) {
+ return false;
+ }
+
+ // We put everything other than the fast path, i.e. vlogging is initialized
+ // but not on, behind an out-of-line function to reduce code size.
+ return SlowIsEnabled(stale_v, level);
+ }
+ ...
+ private:
+ ...
+ ABSL_ATTRIBUTE_NOINLINE
+ bool SlowIsEnabled(int stale_v, int level);
+ ...
+};
+
+class VLogSite final {
+ public:
+ ...
+ bool IsEnabled(int level) {
+ int stale_v = v_.load(std::memory_order_relaxed);
+ if (ABSL_PREDICT_TRUE(level > stale_v)) {
+ return false;
+ }
+
+ // We put everything other than the fast path, i.e. vlogging is initialized
+ // but not on, behind an out-of-line function to reduce code size.
+ // "level" is almost always a call-site constant, so we can save a bit
+ // of code space by special-casing for levels 1, 2, and 3.
+#if defined(__has_builtin) && __has_builtin(__builtin_constant_p)
+ if (__builtin_constant_p(level)) {
+ if (level == 0) return SlowIsEnabled0(stale_v);
+ if (level == 1) return SlowIsEnabled1(stale_v);
+ if (level == 2) return SlowIsEnabled2(stale_v);
+ if (level == 3) return SlowIsEnabled3(stale_v);
+ if (level == 4) return SlowIsEnabled4(stale_v);
+ if (level == 5) return SlowIsEnabled5(stale_v);
+ }
+#endif
+ return SlowIsEnabled(stale_v, level);
+ ...
+ private:
+ ...
+ ABSL_ATTRIBUTE_NOINLINE
+ bool SlowIsEnabled(int stale_v, int level);
+ ABSL_ATTRIBUTE_NOINLINE bool SlowIsEnabled0(int stale_v);
+ ABSL_ATTRIBUTE_NOINLINE bool SlowIsEnabled1(int stale_v);
+ ABSL_ATTRIBUTE_NOINLINE bool SlowIsEnabled2(int stale_v);
+ ABSL_ATTRIBUTE_NOINLINE bool SlowIsEnabled3(int stale_v);
+ ABSL_ATTRIBUTE_NOINLINE bool SlowIsEnabled4(int stale_v);
+ ABSL_ATTRIBUTE_NOINLINE bool SlowIsEnabled5(int stale_v);
+ ...
+};
+
+vlog_is_on.cc
+ +bool VLogSite::SlowIsEnabled0(int stale_v) { return SlowIsEnabled(stale_v, 0); }
+bool VLogSite::SlowIsEnabled1(int stale_v) { return SlowIsEnabled(stale_v, 1); }
+bool VLogSite::SlowIsEnabled2(int stale_v) { return SlowIsEnabled(stale_v, 2); }
+bool VLogSite::SlowIsEnabled3(int stale_v) { return SlowIsEnabled(stale_v, 3); }
+bool VLogSite::SlowIsEnabled4(int stale_v) { return SlowIsEnabled(stale_v, 4); }
+bool VLogSite::SlowIsEnabled5(int stale_v) { return SlowIsEnabled(stale_v, 5); }
+
+Replace RE2 call with a simple prefix match when possible.
+read_matcher.cc
+enum MatchItemType {
+ MATCH_TYPE_INVALID,
+ MATCH_TYPE_RANGE,
+ MATCH_TYPE_EXACT,
+ MATCH_TYPE_REGEXP,
+};
+
+enum MatchItemType {
+ MATCH_TYPE_INVALID,
+ MATCH_TYPE_RANGE,
+ MATCH_TYPE_EXACT,
+ MATCH_TYPE_REGEXP,
+ MATCH_TYPE_PREFIX, // Special type for regexp ".*"
+};
+
+read_matcher.cc
+p->type = MATCH_TYPE_REGEXP;
+
+term.NonMetaPrefix().CopyToString(&p->prefix);
+if (term.RegexpSuffix() == ".*") {
+ // Special case for a regexp that matches anything, so we can
+ // bypass RE2::FullMatch
+ p->type = MATCH_TYPE_PREFIX;
+} else {
+ p->type = MATCH_TYPE_REGEXP;
+
+Use StrCat rather than StringPrintf to format IP addresses.
+ipaddress.cc
+string IPAddress::ToString() const {
+ char buf[INET6_ADDRSTRLEN];
+
+ switch (address_family_) {
+ case AF_INET:
+ CHECK(inet_ntop(AF_INET, &addr_.addr4, buf, INET6_ADDRSTRLEN) != NULL);
+ return buf;
+ case AF_INET6:
+ CHECK(inet_ntop(AF_INET6, &addr_.addr6, buf, INET6_ADDRSTRLEN) != NULL);
+ return buf;
+ case AF_UNSPEC:
+ LOG(DFATAL) << "Calling ToString() on an empty IPAddress";
+ return "";
+ default:
+ LOG(FATAL) << "Unknown address family " << address_family_;
+ }
+}
+...
+string IPAddressToURIString(const IPAddress& ip) {
+ switch (ip.address_family()) {
+ case AF_INET6:
+ return StringPrintf("[%s]", ip.ToString().c_str());
+ default:
+ return ip.ToString();
+ }
+}
+...
+string SocketAddress::ToString() const {
+ return IPAddressToURIString(host_) + StringPrintf(":%u", port_);
+}
+
+string IPAddress::ToString() const {
+ char buf[INET6_ADDRSTRLEN];
+
+ switch (address_family_) {
+ case AF_INET: {
+ uint32 addr = gntohl(addr_.addr4.s_addr);
+ int a1 = static_cast<int>((addr >> 24) & 0xff);
+ int a2 = static_cast<int>((addr >> 16) & 0xff);
+ int a3 = static_cast<int>((addr >> 8) & 0xff);
+ int a4 = static_cast<int>(addr & 0xff);
+ return StrCat(a1, ".", a2, ".", a3, ".", a4);
+ }
+ case AF_INET6:
+ CHECK(inet_ntop(AF_INET6, &addr_.addr6, buf, INET6_ADDRSTRLEN) != NULL);
+ return buf;
+ case AF_UNSPEC:
+ LOG(DFATAL) << "Calling ToString() on an empty IPAddress";
+ return "";
+ default:
+ LOG(FATAL) << "Unknown address family " << address_family_;
+ }
+}
+...
+string IPAddressToURIString(const IPAddress& ip) {
+ switch (ip.address_family()) {
+ case AF_INET6:
+ return StrCat("[", ip.ToString(), "]");
+ default:
+ return ip.ToString();
+ }
+}
+...
+string SocketAddress::ToString() const {
+ return StrCat(IPAddressToURIString(host_), ":", port_);
+}
+
+Cache based on precomputed fingerprint of large serialized proto.
+dp_ops.cc
+InputOutputMappingProto mapping_proto;
+PLAQUE_OP_REQUIRES(
+ mapping_proto.ParseFromStringPiece(GetAttrMappingProto(state)),
+ absl::InternalError("Failed to parse InputOutputMappingProto"));
+ParseMapping(mapping_proto);
+
+uint64 mapping_proto_fp = GetAttrMappingProtoFp(state);
+{
+ absl::MutexLock l(&fp_to_iometa_mu);
+ if (fp_to_iometa == nullptr) {
+ fp_to_iometa =
+ new absl::flat_hash_map<uint64, std::unique_ptr<ProgramIOMetadata>>;
+ }
+ auto it = fp_to_iometa->find(mapping_proto_fp);
+ if (it != fp_to_iometa->end()) {
+ io_metadata_ = it->second.get();
+ } else {
+ auto serial_proto = GetAttrMappingProto(state);
+ DCHECK_EQ(mapping_proto_fp, Fingerprint(serial_proto));
+ InputOutputMappingProto mapping_proto;
+ PLAQUE_OP_REQUIRES(
+ mapping_proto.ParseFromStringPiece(GetAttrMappingProto(state)),
+ absl::InternalError("Failed to parse InputOutputMappingProto"));
+ auto io_meta = ParseMapping(mapping_proto);
+ io_metadata_ = io_meta.get();
+ (*fp_to_iometa)[mapping_proto_fp] = std::move(io_meta);
+ }
+}
+
+Speed up ShapeUtil::ForEachState by replacing absl::Span with raw pointers to
+the underlying arrays.
+shape_util.h
+struct ForEachState {
+ ForEachState(const Shape& s, absl::Span<const int64_t> b,
+ absl::Span<const int64_t> c, absl::Span<const int64_t> i);
+ ~ForEachState();
+
+ const Shape& shape;
+ const absl::Span<const int64_t> base;
+ const absl::Span<const int64_t> count;
+ const absl::Span<const int64_t> incr;
+
+struct ForEachState {
+ ForEachState(const Shape& s, absl::Span<const int64_t> b,
+ absl::Span<const int64_t> c, absl::Span<const int64_t> i);
+ inline ~ForEachState() = default;
+
+ const Shape& shape;
+ // Pointers to arrays of the passed-in spans
+ const int64_t* const base;
+ const int64_t* const count;
+ const int64_t* const incr;
+
+Hand unroll cyclic redundancy check (CRC) computation loop.
+crc.cc
+void CRC32::Extend(uint64 *lo, uint64 *hi, const void *bytes, size_t length)
+ const {
+ ...
+ // Process bytes 4 at a time
+ while ((p + 4) <= e) {
+ uint32 c = l ^ WORD(p);
+ p += 4;
+ l = this->table3_[c & 0xff] ^
+ this->table2_[(c >> 8) & 0xff] ^
+ this->table1_[(c >> 16) & 0xff] ^
+ this->table0_[c >> 24];
+ }
+
+ // Process the last few bytes
+ while (p != e) {
+ int c = (l & 0xff) ^ *p++;
+ l = this->table0_[c] ^ (l >> 8);
+ }
+ *lo = l;
+}
+
+void CRC32::Extend(uint64 *lo, uint64 *hi, const void *bytes, size_t length)
+ const {
+ ...
+#define STEP { \
+ uint32 c = l ^ WORD(p); \
+ p += 4; \
+ l = this->table3_[c & 0xff] ^ \
+ this->table2_[(c >> 8) & 0xff] ^ \
+ this->table1_[(c >> 16) & 0xff] ^ \
+ this->table0_[c >> 24]; \
+}
+
+ // Process bytes 16 at a time
+ while ((e-p) >= 16) {
+ STEP;
+ STEP;
+ STEP;
+ STEP;
+ }
+
+ // Process bytes 4 at a time
+ while ((p + 4) <= e) {
+ STEP;
+ }
+#undef STEP
+
+ // Process the last few bytes
+ while (p != e) {
+ int c = (l & 0xff) ^ *p++;
+ l = this->table0_[c] ^ (l >> 8);
+ }
+ *lo = l;
+}
+
+
+Handle four characters at a time when parsing Spanner keys.
+Hand unroll the loop to deal with four characters at a time rather than using
+memchr.
+Manually unroll the loop for finding separated sections of a name.
+Go backwards to find separated portions of a name with '#' separators (rather
+than forwards) since the first part is likely the longest in the name.
+key.cc
+void Key::InitSeps(const char* start) {
+ const char* base = &rep_[0];
+ const char* limit = base + rep_.size();
+ const char* s = start;
+
+ DCHECK_GE(s, base);
+ DCHECK_LT(s, limit);
+
+ for (int i = 0; i < 3; i++) {
+ s = (const char*)memchr(s, '#', limit - s);
+ DCHECK(s != NULL);
+ seps_[i] = s - base;
+ s++;
+ }
+}
+
+inline const char* ScanBackwardsForSep(const char* base, const char* p) {
+ while (p >= base + 4) {
+ if (p[0] == '#') return p;
+ if (p[-1] == '#') return p-1;
+ if (p[-2] == '#') return p-2;
+ if (p[-3] == '#') return p-3;
+ p -= 4;
+ }
+ while (p >= base && *p != '#') p--;
+ return p;
+}
+
+void Key::InitSeps(const char* start) {
+ const char* base = &rep_[0];
+ const char* limit = base + rep_.size();
+ const char* s = start;
+
+ DCHECK_GE(s, base);
+ DCHECK_LT(s, limit);
+
+ // We go backwards from the end of the string, rather than forwards,
+ // since the directory name might be long and definitely doesn't contain
+ // any '#' characters.
+ const char* p = ScanBackwardsForSep(s, limit - 1);
+ DCHECK(*p == '#');
+ seps_[2] = p - base;
+ p--;
+
+ p = ScanBackwardsForSep(s, p);
+ DCHECK(*p == '#');
+ seps_[1] = p - base;
+ p--;
+
+ p = ScanBackwardsForSep(s, p);
+ DCHECK(*p == '#');
+ seps_[0] = p - base;
+}
+
+Avoid frame setup costs by converting ABSL_LOG(FATAL) to ABSL_DCHECK(false).
+arena_cleanup.h
+inline ABSL_ATTRIBUTE_ALWAYS_INLINE size_t Size(Tag tag) {
+ if (!EnableSpecializedTags()) return sizeof(DynamicNode);
+
+ switch (tag) {
+ case Tag::kDynamic:
+ return sizeof(DynamicNode);
+ case Tag::kString:
+ return sizeof(TaggedNode);
+ case Tag::kCord:
+ return sizeof(TaggedNode);
+ default:
+ ABSL_LOG(FATAL) << "Corrupted cleanup tag: " << static_cast<int>(tag);
+ return sizeof(DynamicNode);
+ }
+}
+
+inline ABSL_ATTRIBUTE_ALWAYS_INLINE size_t Size(Tag tag) {
+ if (!EnableSpecializedTags()) return sizeof(DynamicNode);
+
+ switch (tag) {
+ case Tag::kDynamic:
+ return sizeof(DynamicNode);
+ case Tag::kString:
+ return sizeof(TaggedNode);
+ case Tag::kCord:
+ return sizeof(TaggedNode);
+ default:
+ ABSL_DCHECK(false) << "Corrupted cleanup tag: " << static_cast<int>(tag);
+ return sizeof(DynamicNode);
+ }
+}
+
+Stop maintaining expensive stats about the number of alarms and closures in
+SelectServer.
+Part of changes that reduce time for setting an alarm from 771 ns to 271 ns.
+selectserver.h
+class SelectServer {
+ public:
+ ...
+ protected:
+ ...
+ scoped_ptr<MinuteTenMinuteHourStat> num_alarms_stat_;
+ ...
+ scoped_ptr<MinuteTenMinuteHourStat> num_closures_stat_;
+ ...
+};
+
+// Selectserver class
+class SelectServer {
+ ...
+ protected:
+ ...
+};
+
+/selectserver.cc
+void SelectServer::AddAlarmInternal(Alarmer* alarmer,
+ int offset_in_ms,
+ int id,
+ bool is_periodic) {
+ ...
+ alarms_->insert(alarm);
+ num_alarms_stat_->IncBy(1);
+ ...
+}
+
+void SelectServer::AddAlarmInternal(Alarmer* alarmer,
+ int offset_in_ms,
+ int id,
+ bool is_periodic) {
+ ...
+ alarms_->Add(alarm);
+ ...
+}
+
+/selectserver.cc
+void SelectServer::RemoveAlarm(Alarmer* alarmer, int id) {
+ ...
+ alarms_->erase(alarm);
+ num_alarms_stat_->IncBy(-1);
+ ...
+}
+
+void SelectServer::RemoveAlarm(Alarmer* alarmer, int id) {
+ ...
+ alarms_->Remove(alarm);
+ ...
+}
+
+Maintain stats for just a sample of doc info requests.
+Sampling allows us to avoid touching 39 histograms and MinuteTenMinuteHour
+stats for most requests.
+generic-leaf-stats.cc
+... code that touches 39 histograms to update various stats on every request ...
+
+// Add to the histograms periodically
+if (TryLockToUpdateHistogramsDocInfo(docinfo_stats, bucket)) {
+ // Returns true and grabs bucket->lock only if we should sample this
+ // request for maintaining stats
+ ... code that touches 39 histograms to update various stats ...
+ bucket->lock.Unlock();
+}
+
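+A sketch of what such a sampling gate can look like (hypothetical names; the
+implementation of TryLockToUpdateHistogramsDocInfo above is not shown, so this
+is a generic stand-in):
+
+#include <atomic>
+#include <cstdint>
+#include "absl/synchronization/mutex.h"
+
+struct StatsBucket {
+  absl::Mutex lock;
+  // ... the 39 histograms and rate stats live here ...
+};
+
+// Returns true and acquires bucket->lock for roughly 1 in sample_rate calls;
+// the cheap atomic increment replaces dozens of histogram updates on the
+// unsampled requests.
+inline bool TryLockToUpdateStats(std::atomic<uint64_t>* counter,
+                                 StatsBucket* bucket,
+                                 uint64_t sample_rate = 32) {
+  if (counter->fetch_add(1, std::memory_order_relaxed) % sample_rate != 0) {
+    return false;
+  }
+  bucket->lock.Lock();
+  return true;
+}
+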
+
+Reduce sampling rate and make faster sampling decisions.
+This change reduces the sampling rate from 1 in 10 to 1 in 32. Furthermore, we
+now keep execution time stats just for the sampled events and speed up sampling
+decisions by using a power of two modulus. This code is called on every packet
+in the Google Meet video conferencing system and needed performance work to
+keep up with capacity demands during the first part of the COVID outbreak as
+users rapidly migrated to doing more online meetings.
+packet_executor.cc
+class ScopedPerformanceMeasurement {
+ public:
+ explicit ScopedPerformanceMeasurement(PacketExecutor* packet_executor)
+ : packet_executor_(packet_executor),
+ tracer_(packet_executor->packet_executor_trace_threshold_,
+ kClosureTraceName) {
+ // ThreadCPUUsage is an expensive call. At the time of writing,
+ // it takes over 400ns, or roughly 30 times slower than absl::Now,
+ // so we sample only 10% of closures to keep the cost down.
+ if (packet_executor->closures_executed_ % 10 == 0) {
+ thread_cpu_usage_start_ = base::ThreadCPUUsage();
+ }
+
+ // Sample start time after potentially making the above expensive call,
+ // so as not to pollute wall time measurements.
+ run_start_time_ = absl::Now();
+ }
+
+ ~ScopedPerformanceMeasurement() {
+
+ScopedPerformanceMeasurement::ScopedPerformanceMeasurement(
+ PacketExecutor* packet_executor)
+ : packet_executor_(packet_executor),
+ tracer_(packet_executor->packet_executor_trace_threshold_,
+ kClosureTraceName) {
+ // ThreadCPUUsage is an expensive call. At the time of writing,
+ // it takes over 400ns, or roughly 30 times slower than absl::Now,
+ // so we sample only 1 in 32 closures to keep the cost down.
+ if (packet_executor->closures_executed_ % 32 == 0) {
+ thread_cpu_usage_start_ = base::ThreadCPUUsage();
+ }
+
+ // Sample start time after potentially making the above expensive call,
+ // so as not to pollute wall time measurements.
+ run_start_time_ = absl::Now();
+}
+
+packet_executor.cc
+~ScopedPerformanceMeasurement() {
+ auto run_end_time = absl::Now();
+ auto run_duration = run_end_time - run_start_time_;
+
+ if (thread_cpu_usage_start_.has_value()) {
+ ...
+ }
+
+ closure_execution_time->Record(absl::ToInt64Microseconds(run_duration));
+
+ScopedPerformanceMeasurement::~ScopedPerformanceMeasurement() {
+ auto run_end_time = absl::Now();
+ auto run_duration = run_end_time - run_start_time_;
+
+ if (thread_cpu_usage_start_.has_value()) {
+ ...
+ closure_execution_time->Record(absl::ToInt64Microseconds(run_duration));
+ }
+
+Benchmark results:
+Run on (40 X 2793 MHz CPUs); 2020-03-24T20:08:19.991412535-07:00
+CPU: Intel Ivybridge with HyperThreading (20 cores) dL1:32KB dL2:256KB dL3:25MB
+Benchmark Base (ns) New (ns) Improvement
+----------------------------------------------------------------------------
+BM_PacketOverhead_mean 224 85 +62.0%
+
+
+Remove logging from guts of memory allocator.
+This was a small part of a larger change.
+gpu_bfc_allocator.cc
+void GPUBFCAllocator::SplitChunk(...) {
+ ...
+ VLOG(6) << "Adding to chunk map: " << new_chunk->ptr;
+ ...
+}
+...
+void GPUBFCAllocator::DeallocateRawInternal(void* ptr) {
+ ...
+ VLOG(6) << "Chunk at " << c->ptr << " no longer in use";
+ ...
+}
+
+void GPUBFCAllocator::SplitChunk(...) {
+...
+}
+...
+void GPUBFCAllocator::DeallocateRawInternal(void* ptr) {
+...
+}
+
+
+Precompute whether or not logging is enabled outside a nested loop.
+image_similarity.cc
+for (int j = 0; j < output_subimage_size_y; j++) {
+ int j1 = j - rad + output_to_integral_subimage_y;
+ int j2 = j1 + 2 * rad + 1;
+ // Create a pointer for this row's output, taking into account the offset
+ // to the full image.
+ double *image_diff_ptr = &(*image_diff)(j + min_j, min_i);
+
+ for (int i = 0; i < output_subimage_size_x; i++) {
+ ...
+ if (VLOG_IS_ON(3)) {
+ ...
+ }
+ ...
+ }
+}
+
+const bool vlog_3 = DEBUG_MODE ? VLOG_IS_ON(3) : false;
+
+for (int j = 0; j < output_subimage_size_y; j++) {
+ int j1 = j - rad + output_to_integral_subimage_y;
+ int j2 = j1 + 2 * rad + 1;
+ // Create a pointer for this row's output, taking into account the offset
+ // to the full image.
+ double *image_diff_ptr = &(*image_diff)(j + min_j, min_i);
+
+ for (int i = 0; i < output_subimage_size_x; i++) {
+ ...
+ if (vlog_3) {
+ ...
+ }
+ }
+}
+
+Run on (40 X 2801 MHz CPUs); 2016-05-16T15:55:32.250633072-07:00
+CPU: Intel Ivybridge with HyperThreading (20 cores) dL1:32KB dL2:256KB dL3:25MB
+Benchmark Base (ns) New (ns) Improvement
+------------------------------------------------------------------
+BM_NCCPerformance/16 29104 26372 +9.4%
+BM_NCCPerformance/64 473235 425281 +10.1%
+BM_NCCPerformance/512 30246238 27622009 +8.7%
+BM_NCCPerformance/1k 125651445 113361991 +9.8%
+BM_NCCLimitedBoundsPerformance/16 8314 7498 +9.8%
+BM_NCCLimitedBoundsPerformance/64 143508 132202 +7.9%
+BM_NCCLimitedBoundsPerformance/512 9335684 8477567 +9.2%
+BM_NCCLimitedBoundsPerformance/1k 37223897 34201739 +8.1%
+
+
+Precompute whether logging is enabled and use the result +in helper routines.
+periodic_call.cc
+ + VLOG(1) << Logid()
+ << "MaybeScheduleAlarmAtNextTick. Time until next real time: "
+ << time_until_next_real_time;
+ ...
+ uint64 next_virtual_time_ms =
+ next_virtual_time_ms_ - num_ticks * kResolutionMs;
+ CHECK_GE(next_virtual_time_ms, 0);
+ ScheduleAlarm(now, delay, next_virtual_time_ms);
+}
+
+void ScheduleNextAlarm(uint64 current_virtual_time_ms)
+ ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
+ if (calls_.empty()) {
+ VLOG(1) << Logid() << "No calls left, entering idle mode";
+ next_real_time_ = absl::InfiniteFuture();
+ return;
+ }
+ uint64 next_virtual_time_ms = FindNextVirtualTime(current_virtual_time_ms);
+ auto delay =
+ absl::Milliseconds(next_virtual_time_ms - current_virtual_time_ms);
+ ScheduleAlarm(GetClock().TimeNow(), delay, next_virtual_time_ms);
+}
+
+// An alarm scheduled by this function supersedes all previously scheduled
+// alarms. This is ensured through `scheduling_sequence_number_`.
+void ScheduleAlarm(absl::Time now, absl::Duration delay,
+ uint64 virtual_time_ms)
+ ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
+ next_real_time_ = now + delay;
+ next_virtual_time_ms_ = virtual_time_ms;
+ ++ref_count_; // The Alarm holds a reference.
+ ++scheduling_sequence_number_;
+ VLOG(1) << Logid() << "ScheduleAlarm. Time : "
+ << absl::FormatTime("%M:%S.%E3f", now, absl::UTCTimeZone())
+ << ", delay: " << delay << ", virtual time: " << virtual_time_ms
+ << ", refs: " << ref_count_
+ << ", seq: " << scheduling_sequence_number_
+ << ", executor: " << executor_;
+
+ executor_->AddAfter(
+ delay, new Alarm(this, virtual_time_ms, scheduling_sequence_number_));
+}
+
+ const bool vlog_1 = VLOG_IS_ON(1);
+
+ if (vlog_1) {
+ VLOG(1) << Logid()
+ << "MaybeScheduleAlarmAtNextTick. Time until next real time: "
+ << time_until_next_real_time;
+ }
+ ...
+ uint64 next_virtual_time_ms =
+ next_virtual_time_ms_ - num_ticks * kResolutionMs;
+ CHECK_GE(next_virtual_time_ms, 0);
+ ScheduleAlarm(now, delay, next_virtual_time_ms, vlog_1);
+}
+
+void ScheduleNextAlarm(uint64 current_virtual_time_ms, bool vlog_1)
+ ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
+ if (calls_.empty()) {
+ if (vlog_1) {
+ VLOG(1) << Logid() << "No calls left, entering idle mode";
+ }
+ next_real_time_ = absl::InfiniteFuture();
+ return;
+ }
+ uint64 next_virtual_time_ms = FindNextVirtualTime(current_virtual_time_ms);
+ auto delay =
+ absl::Milliseconds(next_virtual_time_ms - current_virtual_time_ms);
+ ScheduleAlarm(GetClock().TimeNow(), delay, next_virtual_time_ms, vlog_1);
+}
+
+// An alarm scheduled by this function supersedes all previously scheduled
+// alarms. This is ensured through `scheduling_sequence_number_`.
+void ScheduleAlarm(absl::Time now, absl::Duration delay,
+ uint64 virtual_time_ms,
+ bool vlog_1)
+ ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
+ next_real_time_ = now + delay;
+ next_virtual_time_ms_ = virtual_time_ms;
+ ++ref_count_; // The Alarm holds a reference.
+ ++scheduling_sequence_number_;
+ if (vlog_1) {
+ VLOG(1) << Logid() << "ScheduleAlarm. Time : "
+ << absl::FormatTime("%M:%S.%E3f", now, absl::UTCTimeZone())
+ << ", delay: " << delay << ", virtual time: " << virtual_time_ms
+ << ", refs: " << ref_count_
+ << ", seq: " << scheduling_sequence_number_
+ << ", executor: " << executor_;
+ }
+
+ executor_->AddAfter(
+ delay, new Alarm(this, virtual_time_ms, scheduling_sequence_number_));
+}
+
+Speed up TF_CHECK_OK.
+Avoid creating Ok object, and save code space by doing complex formatting of +fatal error message out of line instead of at every call site.
+status.h
+ +#define TF_CHECK_OK(val) CHECK_EQ(::tensorflow::Status::OK(), (val))
+#define TF_QCHECK_OK(val) QCHECK_EQ(::tensorflow::Status::OK(), (val))
+
+extern tensorflow::string* TfCheckOpHelperOutOfLine(
+ const ::tensorflow::Status& v, const char* msg);
+inline tensorflow::string* TfCheckOpHelper(::tensorflow::Status v,
+ const char* msg) {
+ if (v.ok()) return nullptr;
+ return TfCheckOpHelperOutOfLine(v, msg);
+}
+#define TF_CHECK_OK(val) \
+ while (tensorflow::string* _result = TfCheckOpHelper(val, #val)) \
+ LOG(FATAL) << *(_result)
+#define TF_QCHECK_OK(val) \
+ while (tensorflow::string* _result = TfCheckOpHelper(val, #val)) \
+ LOG(QFATAL) << *(_result)
+
+status.cc
+ +string* TfCheckOpHelperOutOfLine(const ::tensorflow::Status& v,
+ const char* msg) {
+ string r("Non-OK-status: ");
+ r += msg;
+ r += " status: ";
+ r += v.ToString();
+ // Leaks string but this is only to be used in a fatal error message
+ return new string(r);
+}
+
+Shrink each RETURN_IF_ERROR call site by 79 bytes of +code.
+Improve performance of CHECK_GE by 4.5X and shrink code +size from 125 bytes to 77 bytes.
+logging.h
+ +struct CheckOpString {
+ CheckOpString(string* str) : str_(str) { }
+ ~CheckOpString() { delete str_; }
+ operator bool() const { return str_ == NULL; }
+ string* str_;
+};
+...
+#define DEFINE_CHECK_OP_IMPL(name, op) \
+ template <class t1, class t2> \
+ inline string* Check##name##Impl(const t1& v1, const t2& v2, \
+ const char* names) { \
+ if (v1 op v2) return NULL; \
+ else return MakeCheckOpString(v1, v2, names); \
+ } \
+ string* Check##name##Impl(int v1, int v2, const char* names);
+DEFINE_CHECK_OP_IMPL(EQ, ==)
+DEFINE_CHECK_OP_IMPL(NE, !=)
+DEFINE_CHECK_OP_IMPL(LE, <=)
+DEFINE_CHECK_OP_IMPL(LT, < )
+DEFINE_CHECK_OP_IMPL(GE, >=)
+DEFINE_CHECK_OP_IMPL(GT, > )
+#undef DEFINE_CHECK_OP_IMPL
+
+struct CheckOpString {
+ CheckOpString(string* str) : str_(str) { }
+ // No destructor: if str_ is non-NULL, we're about to LOG(FATAL),
+ // so there's no point in cleaning up str_.
+ operator bool() const { return str_ == NULL; }
+ string* str_;
+};
+...
+extern string* MakeCheckOpStringIntInt(int v1, int v2, const char* names);
+
+template <>
+inline string* MakeCheckOpString(const int& v1, const int& v2, const char* names) {
+ return MakeCheckOpStringIntInt(v1, v2, names);
+}
+...
+#define DEFINE_CHECK_OP_IMPL(name, op) \
+ template <class t1, class t2> \
+ inline string* Check##name##Impl(const t1& v1, const t2& v2, \
+ const char* names) { \
+ if (v1 op v2) return NULL; \
+ else return MakeCheckOpString(v1, v2, names); \
+ } \
+ inline string* Check##name##Impl(int v1, int v2, const char* names) { \
+ if (v1 op v2) return NULL; \
+ else return MakeCheckOpString(v1, v2, names); \
+ }
+DEFINE_CHECK_OP_IMPL(EQ, ==)
+DEFINE_CHECK_OP_IMPL(NE, !=)
+DEFINE_CHECK_OP_IMPL(LE, <=)
+DEFINE_CHECK_OP_IMPL(LT, < )
+DEFINE_CHECK_OP_IMPL(GE, >=)
+DEFINE_CHECK_OP_IMPL(GT, > )
+#undef DEFINE_CHECK_OP_IMPL
+
+logging.cc
+ +string* MakeCheckOpStringIntInt(int v1, int v2, const char* names) {
+ strstream ss;
+ ss << names << " (" << v1 << " vs. " << v2 << ")";
+ return new string(ss.str(), ss.pcount());
+}
+
+Reduce inlining in TensorFlow.
+The change stops inlining many non-performance-sensitive functions (e.g., error +paths and op registration code). Furthermore, slow paths of some +performance-sensitive functions are moved into non-inlined functions.
+These changes reduce the size of tensorflow symbols in a typical binary by
+12.2% (8814545 bytes down to 7740233 bytes).
+ +Protocol buffer library change. Avoid expensive inlined +code space for encoding message length for messages ≥ 128 bytes and instead +do a procedure call to a shared out-of-line routine.
+Not only makes important large binaries smaller but also faster.
+Bytes of generated code per line of a heavily inlined routine in one large +binary. First number represents the total bytes generated for a particular +source line including all locations where that code has been inlined.
+Before:
+. 0 1825 template <typename MessageType>
+. 0 1826 inline uint8* WireFormatLite::InternalWriteMessage(
+. 0 1827 int field_number, const MessageType& value, uint8* target,
+. 0 1828 io::EpsCopyOutputStream* stream) {
+>>> 389246 1829 target = WriteTagToArray(field_number, WIRETYPE_LENGTH_DELIMITED, target);
+>>> 5454640 1830 target = io::CodedOutputStream::WriteVarint32ToArray(
+>>> 337837 1831 static_cast<uint32>(value.GetCachedSize()), target);
+>>> 1285539 1832 return value._InternalSerialize(target, stream);
+. 0 1833 }
+
+The new codesize output with this change looks like:
+. 0 1825 template <typename MessageType>
+. 0 1826 inline uint8* WireFormatLite::InternalWriteMessage(
+. 0 1827 int field_number, const MessageType& value, uint8* target,
+. 0 1828 io::EpsCopyOutputStream* stream) {
+>>> 450612 1829 target = WriteTagToArray(field_number, WIRETYPE_LENGTH_DELIMITED, target);
+>> 9609 1830 target = io::CodedOutputStream::WriteVarint32ToArrayOutOfLine(
+>>> 434668 1831 static_cast<uint32>(value.GetCachedSize()), target);
+>>> 1597394 1832 return value._InternalSerialize(target, stream);
+. 0 1833 }
+
+coded_stream.h
+ +class PROTOBUF_EXPORT CodedOutputStream {
+ ...
+ // Like WriteVarint32() but writing directly to the target array, and with the
+ // less common-case paths being out of line rather than inlined.
+ static uint8* WriteVarint32ToArrayOutOfLine(uint32 value, uint8* target);
+ ...
+};
+...
+inline uint8* CodedOutputStream::WriteVarint32ToArrayOutOfLine(uint32 value,
+ uint8* target) {
+ target[0] = static_cast<uint8>(value);
+ if (value < 0x80) {
+ return target + 1;
+ } else {
+ return WriteVarint32ToArrayOutOfLineHelper(value, target);
+ }
+}
+
+coded_stream.cc
+ +uint8* CodedOutputStream::WriteVarint32ToArrayOutOfLineHelper(uint32 value,
+ uint8* target) {
+ DCHECK_GE(value, 0x80);
+ target[0] |= static_cast<uint8>(0x80);
+ value >>= 7;
+ target[1] = static_cast<uint8>(value);
+ if (value < 0x80) {
+ return target + 2;
+ }
+ target += 2;
+ do {
+ // Turn on continuation bit in the byte we just wrote.
+ target[-1] |= static_cast<uint8>(0x80);
+ value >>= 7;
+ *target = static_cast<uint8>(value);
+ ++target;
+ } while (value >= 0x80);
+ return target;
+}
+
+Reduce absl::flat_hash_set and absl::flat_hash_map code +size.
+Reduces sizes of some large binaries by ~0.5%.
+ +Do not inline string allocation and deallocation when not +using protobuf arenas.
+public/arenastring.h
+ + if (IsDefault(default_value)) {
+ std::string* new_string = new std::string();
+ tagged_ptr_.Set(new_string);
+ return new_string;
+ } else {
+ return UnsafeMutablePointer();
+ }
+}
+
+ if (IsDefault(default_value)) {
+ return SetAndReturnNewString();
+ } else {
+ return UnsafeMutablePointer();
+ }
+}
+
+internal/arenastring.cc
+ +std::string* ArenaStringPtr::SetAndReturnNewString() {
+ std::string* new_string = new std::string();
+ tagged_ptr_.Set(new_string);
+ return new_string;
+}
+
+Avoid inlining some routines. Create variants of routines +that take 'const char*' rather than 'const std::string&' to avoid std::string +construction code at every call site.
+op.h
+ +class OpDefBuilderWrapper {
+ public:
+ explicit OpDefBuilderWrapper(const char name[]) : builder_(name) {}
+ OpDefBuilderWrapper& Attr(std::string spec) {
+ builder_.Attr(std::move(spec));
+ return *this;
+ }
+ OpDefBuilderWrapper& Input(std::string spec) {
+ builder_.Input(std::move(spec));
+ return *this;
+ }
+ OpDefBuilderWrapper& Output(std::string spec) {
+ builder_.Output(std::move(spec));
+ return *this;
+ }
+
+class OpDefBuilderWrapper {
+ public:
+ explicit OpDefBuilderWrapper(const char name[]) : builder_(name) {}
+ OpDefBuilderWrapper& Attr(std::string spec) {
+ builder_.Attr(std::move(spec));
+ return *this;
+ }
+ OpDefBuilderWrapper& Attr(const char* spec) TF_ATTRIBUTE_NOINLINE {
+ return Attr(std::string(spec));
+ }
+ OpDefBuilderWrapper& Input(std::string spec) {
+ builder_.Input(std::move(spec));
+ return *this;
+ }
+ OpDefBuilderWrapper& Input(const char* spec) TF_ATTRIBUTE_NOINLINE {
+ return Input(std::string(spec));
+ }
+ OpDefBuilderWrapper& Output(std::string spec) {
+ builder_.Output(std::move(spec));
+ return *this;
+ }
+ OpDefBuilderWrapper& Output(const char* spec) TF_ATTRIBUTE_NOINLINE {
+ return Output(std::string(spec));
+ }
+
+Replace template argument with a regular argument.
+Changed a large routine templated on a bool to instead take the bool as an extra +argument. (The bool was only being used once to select one of two string +constants, so a run-time check was just fine.) This reduced the # of +instantiations of the large routine from 287 to 143.
+sharding_util_ops.cc
+ +template <bool Split>
+Status GetAndValidateAttributes(OpKernelConstruction* ctx,
+ std::vector<int32>& num_partitions,
+ int& num_slices, std::vector<int32>& paddings,
+ bool& has_paddings) {
+ absl::string_view num_partitions_attr_name =
+ Split ? kNumSplitsAttrName : kNumConcatsAttrName;
+ ...
+ return OkStatus();
+}
+
+Status GetAndValidateAttributes(bool split, OpKernelConstruction* ctx,
+ std::vector<int32>& num_partitions,
+ int& num_slices, std::vector<int32>& paddings,
+ bool& has_paddings) {
+ absl::string_view num_partitions_attr_name =
+ split ? kNumSplitsAttrName : kNumConcatsAttrName;
+ ...
+ return OkStatus();
+}
+
+Move bulky code from templated constructor to a +non-templated shared base class constructor.
+Also reduce number of template instantiations from one for every combination of
+<T, Device, Rank> to one for every <T> and every <Rank>.
sharding_util_ops.cc
+ +template <typename Device, typename T>
+class XlaSplitNDBaseOp : public OpKernel {
+ public:
+ explicit XlaSplitNDBaseOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(
+ ctx, GetAndValidateAttributes(/*split=*/true, ctx, num_splits_,
+ num_slices_, paddings_, has_paddings_));
+ }
+
+// Shared base class to save code space
+class XlaSplitNDShared : public OpKernel {
+ public:
+ explicit XlaSplitNDShared(OpKernelConstruction* ctx) TF_ATTRIBUTE_NOINLINE
+ : OpKernel(ctx),
+ num_slices_(1),
+ has_paddings_(false) {
+ GetAndValidateAttributes(/*split=*/true, ctx, num_splits_, num_slices_,
+ paddings_, has_paddings_);
+ }
+
+Reduce generated code size for absl::flat_hash_set and +absl::flat_hash_map.
+Turn a long sequence of individual map insertions that initialize a hash table
+of emoji characters into a single bulk insert operation (188KB of text down to
+360 bytes in a library linked into many binaries). 😊
+textfallback_init.h
+ +inline void AddEmojiFallbacks(TextFallbackMap *map) {
+ (*map)[0xFE000] = &kFE000;
+ (*map)[0xFE001] = &kFE001;
+ (*map)[0xFE002] = &kFE002;
+ (*map)[0xFE003] = &kFE003;
+ (*map)[0xFE004] = &kFE004;
+ (*map)[0xFE005] = &kFE005;
+ ...
+ (*map)[0xFEE7D] = &kFEE7D;
+ (*map)[0xFEEA0] = &kFEEA0;
+ (*map)[0xFE331] = &kFE331;
+};
+
+inline void AddEmojiFallbacks(TextFallbackMap *map) {
+#define PAIR(x) {0x##x, &k##x}
+ // clang-format off
+ map->insert({
+ PAIR(FE000),
+ PAIR(FE001),
+ PAIR(FE002),
+ PAIR(FE003),
+ PAIR(FE004),
+ PAIR(FE005),
+ ...
+ PAIR(FEE7D),
+ PAIR(FEEA0),
+ PAIR(FE331)});
+ // clang-format on
+#undef PAIR
+};
+
+Stop inlining a heavy user of InlinedVector operations.
+Moved very long routine that was being inlined from .h file to .cc (no real +performance benefit from inlining this).
+reduction_ops_common.h
+Status Simplify(const Tensor& data, const Tensor& axis,
+ const bool keep_dims) {
+ ... Eighty line routine body ...
+}
+
+Status Simplify(const Tensor& data, const Tensor& axis, const bool keep_dims);
+
+
+Four-way parallelization improves the rate of encoding +tokens by ~3.6x.
+blocked-token-coder.cc
+ +MutexLock l(&encoder_threads_lock);
+if (encoder_threads == NULL) {
+ encoder_threads = new ThreadPool(NumCPUs());
+ encoder_threads->SetStackSize(262144);
+ encoder_threads->StartWorkers();
+}
+encoder_threads->Add
+ (NewCallback(this,
+ &BlockedTokenEncoder::EncodeRegionInThread,
+ region_tokens, N, region,
+ stats,
+ controller_->GetClosureWithCost
+ (NewCallback(&DummyCallback), N)));
+
+Parallelization improves decoding performance by 5x.
+coding.cc
+for (int c = 0; c < clusters->size(); c++) {
+  RET_CHECK_OK(DecodeBulkForCluster(...));
+}
+
+struct SubTask {
+ absl::Status result;
+ absl::Notification done;
+};
+
+std::vector<SubTask> tasks(clusters->size());
+for (int c = 0; c < clusters->size(); c++) {
+ options_.executor->Schedule([&, c] {
+ tasks[c].result = DecodeBulkForCluster(...);
+ tasks[c].done.Notify();
+ });
+}
+for (int c = 0; c < clusters->size(); c++) {
+ tasks[c].done.WaitForNotification();
+}
+for (int c = 0; c < clusters->size(); c++) {
+ RETURN_IF_ERROR(tasks[c].result);
+}
+
+
+Acquire lock once to free entire tree of query nodes, rather +than reacquiring lock for every node in tree.
+mustang-query.cc
+ +// Pool of query nodes
+ThreadSafeFreeList<MustangQuery> pool_(256);
+...
+void MustangQuery::Release(MustangQuery* node) {
+ if (node == NULL)
+ return;
+ for (int i=0; i < node->children_->size(); ++i)
+ Release((*node->children_)[i]);
+ node->children_->clear();
+ pool_.Delete(node);
+}
+
+// Pool of query nodes
+Mutex pool_lock_;
+FreeList<MustangQuery> pool_(256);
+...
+void MustangQuery::Release(MustangQuery* node) {
+ if (node == NULL)
+ return;
+ MutexLock l(&pool_lock_);
+ ReleaseLocked(node);
+}
+
+void MustangQuery::ReleaseLocked(MustangQuery* node) {
+#ifndef NDEBUG
+ pool_lock_.AssertHeld();
+#endif
+ if (node == NULL)
+ return;
+ for (int i=0; i < node->children_->size(); ++i)
+ ReleaseLocked((*node->children_)[i]);
+ node->children_->clear();
+ pool_.Delete(node);
+}
+
+Reduce number of cache lines touched in critical section.
+Careful data structure adjustments reduce the number of cache lines accessed
+in the critical section significantly, to ~2 + O(num_outgoing edges), and
+improve the performance of an ML training run by 3.3% (and for large graphs
+with many cores executing them there is also less TLB pressure).
+
+Avoid RPC while holding Mutex.
+trainer.cc
+ +{
+ // Notify the parameter server that we are starting.
+ MutexLock l(&lock_);
+ model_ = model;
+ MaybeRecordProgress(last_global_step_);
+}
+
+bool should_start_record_progress = false;
+int64 step_for_progress = -1;
+{
+ // Notify the parameter server that we are starting.
+ MutexLock l(&lock_);
+ model_ = model;
+ should_start_record_progress = ShouldStartRecordProgress();
+ step_for_progress = last_global_step_;
+}
+if (should_start_record_progress) {
+ StartRecordProgress(step_for_progress);
+}
+
+Shard a cache 16 ways which improves throughput under a +multi-threaded load by ~2x.
+cache.cc
+ +class ShardedLRUCache : public Cache {
+ private:
+ LRUCache shard_[kNumShards];
+ port::Mutex id_mutex_;
+ uint64_t last_id_;
+
+ static inline uint32_t HashSlice(const Slice& s) {
+ return Hash(s.data(), s.size(), 0);
+ }
+
+ static uint32_t Shard(uint32_t hash) {
+ return hash >> (32 - kNumShardBits);
+ }
+ ...
+ virtual Handle* Lookup(const Slice& key) {
+ const uint32_t hash = HashSlice(key);
+ return shard_[Shard(hash)].Lookup(key, hash);
+ }
+
+Shard Spanner data structure for tracking calls.
+transaction_manager.cc
+ +absl::MutexLock l(&active_calls_in_mu_);
+ActiveCallMap::const_iterator iter = active_calls_in_.find(m->tid());
+if (iter != active_calls_in_.end()) {
+ iter->second.ExtractElements(&m->tmp_calls_);
+}
+
+ActiveCalls::LockedShard shard(active_calls_in_, m->tid());
+const ActiveCallMap& active_calls_map = shard.active_calls_map();
+ActiveCallMap::const_iterator iter = active_calls_map.find(m->tid());
+if (iter != active_calls_map.end()) {
+ iter->second.ExtractElements(&m->tmp_calls_);
+}
+
+Fix information used for shard selection to prevent hash +table issues.
+netmon_map_impl.h
+ +ConnectionBucket* GetBucket(Index index) {
+ // Rehash the hash to make sure we are not partitioning the buckets based on
+ // the original hash. If num_buckets_ is a power of 2 that would drop the
+ // entropy of the buckets.
+ size_t original_hash = absl::Hash<Index>()(index);
+ int hash = absl::Hash<size_t>()(original_hash) % num_buckets_;
+ return &buckets_[hash];
+}
+
+ConnectionBucket* GetBucket(Index index) {
+ absl::Hash<std::pair<Index, size_t>> hasher{};
+ // Combine the hash with 42 to prevent shard selection using the same bits
+ // as the underlying hashtable.
+ return &buckets_[hasher({index, 42}) % num_buckets_];
+}
+
+Shard Spanner data structure used for tracking calls.
+This CL partitions the ActiveCallMap into 64 shards. Each shard is protected by +a separate mutex. A given transaction will be mapped to exactly one shard. A new +interface LockedShard(tid) is added for accessing the ActiveCallMap for a +transaction in a thread-safe manner. Example usage:
+transaction_manager.cc
+ +{
+ absl::MutexLock l(&active_calls_in_mu_);
+ delayed_locks_timer_ring_.Add(delayed_locks_flush_time_ms, tid);
+}
+
+{
+ ActiveCalls::LockedShard shard(active_calls_in_, tid);
+ shard.delayed_locks_timer_ring().Add(delayed_locks_flush_time_ms, tid);
+}
+
+The results show a 69% reduction in overall wall-clock time when running the
+benchmark with 8192 fibers.
+Before:
+Benchmark Time(ns) CPU(ns) Iterations
+------------------------------------------------------------------
+BM_ActiveCalls/8k 11854633492 98766564676 10
+BM_ActiveCalls/16k 26356203552 217325836709 10
+
+After:
+Benchmark Time(ns) CPU(ns) Iterations
+------------------------------------------------------------------
+BM_ActiveCalls/8k 3696794642 39670670110 10
+BM_ActiveCalls/16k 7366284437 79435705713 10
+
+
+Segregate commonly mutated fields in a different cache +line than other fields.
+histogram.h
+ +HistogramOptions options_;
+...
+internal::HistogramBoundaries *boundaries_;
+...
+std::vector<double> buckets_;
+
+double min_; // Minimum.
+double max_; // Maximum.
+double count_; // Total count of occurrences.
+double sum_; // Sum of values.
+double sum_of_squares_; // Sum of squares of values.
+...
+RegisterVariableExporter *exporter_;
+
+ HistogramOptions options_;
+ ...
+ internal::HistogramBoundaries *boundaries_;
+ ...
+ RegisterVariableExporter *exporter_;
+ ...
+ // Place the following fields in a dedicated cacheline as they are frequently
+ // mutated, so we can avoid potential false sharing.
+ ...
+#ifndef SWIG
+ alignas(ABSL_CACHELINE_SIZE)
+#endif
+ std::vector<double> buckets_;
+
+ double min_; // Minimum.
+ double max_; // Maximum.
+ double count_; // Total count of occurrences.
+ double sum_; // Sum of values.
+ double sum_of_squares_; // Sum of squares of values.
+
+Process small work items inline instead of on device +thread pool.
+cast_op.cc
+ +template <typename Device, typename Tout, typename Tin>
+void CastMaybeInline(const Device& d, typename TTypes<Tout>::Flat o,
+ typename TTypes<Tin>::ConstFlat i) {
+ if (o.size() * (sizeof(Tin) + sizeof(Tout)) < 16384) {
+ // Small cast on a CPU: do inline
+ o = i.template cast<Tout>();
+ } else {
+ o.device(d) = i.template cast<Tout>();
+ }
+}
+
+Use lock-free map to manage a cache of RPC channels.
+Entries in an RPC stub cache are read thousands of times a second and modified +rarely. Switching to an appropriate lock-free map reduces search latency by +3%-5%.
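+A minimal sketch of the read-mostly idea (this is not the internal lock-free
+map used in the change; it is a copy-on-write snapshot with a lock-free read
+path, and the StubCache/Stub names are made up for illustration):
+#include <atomic>
+#include <mutex>
+#include <string>
+
+#include "absl/container/flat_hash_map.h"
+
+struct Stub;  // placeholder for the per-channel RPC stub type
+
+// Readers follow an atomic pointer to an immutable snapshot; rare writers
+// build a new snapshot and publish it with a single pointer swap. For
+// simplicity this sketch never frees old snapshots; production code would
+// reclaim them, e.g. with the epoch-based GC used by the lock-free hash map
+// in the next example.
+class StubCache {
+ public:
+  using Map = absl::flat_hash_map<std::string, Stub*>;
+
+  StubCache() : map_(new Map) {}
+
+  // Lock-free read path: just load the current snapshot.
+  Stub* Lookup(const std::string& key) const {
+    const Map* snapshot = map_.load(std::memory_order_acquire);
+    auto it = snapshot->find(key);
+    return it == snapshot->end() ? nullptr : it->second;
+  }
+
+  // Rare writers copy the map, modify the copy, and publish it.
+  void Insert(const std::string& key, Stub* stub) {
+    std::lock_guard<std::mutex> l(write_mu_);  // serializes writers only
+    Map* updated = new Map(*map_.load(std::memory_order_acquire));
+    (*updated)[key] = stub;
+    map_.store(updated, std::memory_order_release);  // old snapshot leaks here
+  }
+
+ private:
+  std::atomic<const Map*> map_;
+  std::mutex write_mu_;
+};
+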
+ +Use a fixed lexicon plus a lock-free hash map to speed up
+determining IsValidTokenId.
+dynamic_token_class_manager.h
+ +mutable Mutex mutex_;
+
+// The density of this hash map is guaranteed by the fact that the
+// dynamic lexicon reuses previously allocated TokenIds before trying
+// to allocate new ones.
+dense_hash_map<TokenId, common::LocalTokenClassId> tid_to_cid_
+ GUARDED_BY(mutex_);
+
+// Read accesses to this hash-map should be done using
+// 'epoch_gc_'::(EnterFast / LeaveFast). The writers should periodically
+// GC the deleted entries, by simply invoking LockFreeHashMap::CreateGC.
+typedef util::gtl::LockFreeHashMap<TokenId, common::LocalTokenClassId>
+ TokenIdTokenClassIdMap;
+TokenIdTokenClassIdMap tid_to_cid_;
+
+Iterating over data in protocol buffer form can be much slower than iterating
+over the same data in plain C++ structs (roughly 20x in the benchmark below).
+Benchmark code for both versions follows the results.
+name old time/op new time/op delta
+BenchmarkIteration 17.4µs ± 5% 0.8µs ± 1% -95.30% (p=0.000 n=11+12)
+
+Protobuf version:
+message PointProto {
+ int32 x = 1;
+ int32 y = 2;
+}
+message PointListProto {
+ repeated PointProto points = 1;
+}
+
+void SumProto(const PointListProto& vec) {
+ int sum = 0;
+ for (const PointProto& p : vec.points()) {
+ sum += p.y();
+ }
+ ABSL_VLOG(1) << sum;
+}
+
+void BenchmarkIteration() {
+ PointListProto points;
+ points.mutable_points()->Reserve(1000);
+ for (int i = 0; i < 1000; i++) {
+ PointProto* p = points.add_points();
+ p->set_x(i);
+ p->set_y(i * 2);
+ }
+ SumProto(points);
+}
+
+Non-protobuf version:
+struct PointStruct {
+ int x;
+ int y;
+};
+
+void SumVector(const std::vector<PointStruct>& vec) {
+ int sum = 0;
+ for (const PointStruct& p : vec) {
+ sum += p.y;
+ }
+ ABSL_VLOG(1) << sum;
+}
+
+void BenchmarkIteration() {
+ std::vector<PointStruct> points;
+ points.reserve(1000);
+ for (int i = 0; i < 1000; i++) {
+ points.push_back({i, i * 2});
+ }
+ SumVector(points);
+}
+
+
+Do not use protobufs unnecessarily.
+Given the factor of 20 performance difference described above, if some data is
+never serialized or parsed, you probably should not put it in a protocol buffer.
+The purpose of protocol buffers is to make it easy to serialize and deserialize
+data structures, but they can have significant code-size, memory, and CPU
+overheads. Do not use them if all you want are some of the other niceties like
+DebugString and copyability.
Avoid unnecessary message hierarchies.
+Message hierarchy can be useful to organize information in a more readable +fashion. However, the extra level of message hierarchy incurs overheads like +memory allocations, function calls, cache misses, larger serialized messages, +etc.
+E.g., instead of:
+message Foo {
+ optional Bar bar = 1;
+}
+message Bar {
+ optional Baz baz = 1;
+}
+message Baz {
+ optional int32 count = 1;
+}
+
+Prefer:
+message Foo {
+ optional int32 count = 1;
+}
+
+A protocol buffer message corresponds to a message class in C++ generated code +and emits a tag and the length of the payload on the wire. To carry an integer, +the old form requires more allocations (and deallocations) and emits a larger +amount of generated code. As a result, all protocol buffer operations (parsing, +serialization, size, etc.) become more expensive, having to traverse the message +tree. The new form does not have such overhead and is more efficient.
+ +Use small field numbers for frequently occurring fields.
+Protobufs use a variable length integer representation for the combination of +field number and wire format (see the +protobuf encoding documentation). +This representation is 1 byte for field numbers between 1 and 15, and two bytes +for field numbers between 16 and 2047. (Field numbers 2048 or greater should +typically be avoided.)
+Consider pre-reserving some small field numbers for future extension of +performance-sensitive protobufs.
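+A small illustration of why this matters (sketch only; VarintSize32 and
+TagSize are illustrative helpers, not protobuf library APIs):
+#include <cstdint>
+
+// A wire-format tag is (field_number << 3) | wire_type, encoded as a varint,
+// so field numbers 1-15 need one byte and 16-2047 need two.
+inline int VarintSize32(uint32_t v) {
+  int n = 1;
+  while (v >= 0x80) {
+    v >>= 7;
+    ++n;
+  }
+  return n;
+}
+
+inline int TagSize(int field_number) {
+  return VarintSize32(static_cast<uint32_t>(field_number) << 3);
+}
+
+// TagSize(15) == 1, TagSize(16) == 2, TagSize(2047) == 2, TagSize(2048) == 3.
+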
+ +Choose carefully between int32, sint32, fixed32, and uint32 (and +similarly for the 64 bit variants).
+Generally, use int32 or int64, but use fixed32 or fixed64 for large
+values like hash codes and sint32 or sint64 for values that are often
+negative.
A varint occupies fewer bytes to encode small integers and can save space at the
+cost of more expensive decoding. However, it can take up more space for negative
+or large values. In that case, using fixed32 or fixed64 (instead of uint32 or
+uint64) reduces size with much cheaper encoding and decoding. For small negative
+integers, use sint32 or sint64 instead of int32 or int64.
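+A sketch of the size differences (VarintSize and ZigZagEncode32 are
+illustrative helpers, not the protobuf library's own):
+#include <cstdint>
+
+// Negative int32/int64 values are sign-extended and encoded as a full
+// 10-byte varint, while sint32 zigzag-encodes small negatives into small
+// unsigned values first.
+inline int VarintSize(uint64_t v) {
+  int n = 1;
+  while (v >= 0x80) {
+    v >>= 7;
+    ++n;
+  }
+  return n;
+}
+
+inline uint32_t ZigZagEncode32(int32_t v) {
+  return (static_cast<uint32_t>(v) << 1) ^ static_cast<uint32_t>(v >> 31);
+}
+
+// VarintSize(static_cast<uint64_t>(int64_t{-1})) == 10  (int32/int64 field)
+// VarintSize(ZigZagEncode32(-1))                 ==  1  (sint32 field)
+// VarintSize(0x9e3779b97f4a7c15ULL)              == 10  (a fixed64 field
+//                                                        always takes 8 bytes)
+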
+ +For proto2, pack repeated numeric fields by annotating them with
+[packed=true].
In proto2, repeated values are serialized as a sequence of (tag, value) pairs by +default. This is inefficient because tags have to be decoded for every element.
+Packed repeated primitives are serialized with the length of the payload first,
+followed by the values without tags. When using fixed-width values, we know the
+final size the moment we start parsing and so can avoid reallocations. With
+packed varints, however, we still don't know how many values are in the payload
+and may have to pay the reallocation cost.
+In proto3, repeated fields are packed by default.
+Packed works best with fixed-width values like fixed32, fixed64, float, double, +etc. since the entire encoded length can be predetermined by multiplying the +number of elements by the fixed value size, instead of having to calculate the +length of each individual element.
+ +Use bytes instead of string for binary data
+and large values.
The string type holds UTF8-encoded text, and can sometimes require validation.
+The bytes type can hold an arbitrary sequence of bytes (non-text data) and is
+often more appropriate as well as more efficient than string.
Consider string_type = VIEW to avoid copying.
Copying a big string or bytes field during parsing is expensive. Such cost can
+often be avoided by marking the field with string_type = VIEW.
message Image {
+ ...
+ bytes jpeg_encoding = 4 [features.(pb.cpp).string_type=VIEW];
+}
+
+Without the VIEW annotation, when the protocol buffer is parsed, the
+potentially large field contents are copied from the serialized protocol buffer
+to a string object in memory. Depending on the number of string or bytes fields
+and the size of those fields, the overhead of copying can be significant.
Instead of copying the big binary blobs, routines like
+ParseFromStringWithAliasing use absl::string_view to reference the original
+backing string. Note that the backing string (the serialized protocol buffer)
+must outlive the protocol buffer instance that contains the alias.
Consider using Cord for large fields to reduce copying
+costs.
Annotating large bytes and string fields with [ctype=CORD] may reduce
+copying costs. This annotation changes the representation of the field from
+std::string to absl::Cord. absl::Cord uses reference counting and
+tree-based storage to reduce copying and appending costs. If a protocol buffer
+is serialized to a cord, parsing a string or bytes field with [ctype=CORD] can
+avoid copying the field contents.
message Document {
+ ...
+ bytes html = 4 [ctype = CORD];
+}
+
+Performance of a Cord field depends on length distribution and access patterns. +Use benchmarks to validate such changes.
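+As a rough illustration of why Cord can help (a standalone absl::Cord sketch,
+not tied to any particular message):
+#include <string>
+
+#include "absl/strings/cord.h"
+
+// Copies of a large Cord share the underlying chunks by reference counting
+// instead of duplicating the bytes.
+void CordCopiesAreCheap() {
+  absl::Cord html(std::string(1 << 20, 'x'));  // 1 MiB payload
+  absl::Cord copy = html;   // shares storage; no megabyte memcpy
+  copy.Append(" trailer");  // appends without rewriting the shared chunks
+}
+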
+ +Use protobuf arenas in C++ code.
+Consider using arenas to save allocation and deallocation costs, especially for +protobufs containing repeated, string, or message fields.
+Message and string fields are heap-allocated (even if the top-level protocol
+buffer object is stack-allocated). If a protocol buffer message has a lot of
+sub-message fields and string fields, allocation and deallocation cost can be
+significant. Arenas amortize allocation costs and make deallocation virtually
+free. They also improve memory locality by allocating from contiguous chunks of
+memory.
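+A minimal sketch using the PointListProto message from the iteration benchmark
+earlier in this document (the generated header name is assumed):
+#include "google/protobuf/arena.h"
+#include "point.pb.h"  // assumed generated header for PointListProto
+
+void BuildPointsOnArena() {
+  google::protobuf::Arena arena;
+  // The message and all of its repeated sub-messages are allocated from the
+  // arena's contiguous chunks; nothing is freed per-message.
+  PointListProto* points =
+      google::protobuf::Arena::CreateMessage<PointListProto>(&arena);
+  for (int i = 0; i < 1000; i++) {
+    PointProto* p = points->add_points();
+    p->set_x(i);
+    p->set_y(i * 2);
+  }
+  // Everything is released in bulk when `arena` goes out of scope.
+}
+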
+ +Keep .proto files small
+Do not put too many messages in a single .proto file. Once you rely on anything
+at all from a .proto file, the entire file will get pulled in by the linker even
+if it's mostly unused. This increases build times and binary sizes. You can use
+extensions and Any to avoid creating hard dependencies on big
+.proto files with many message types.
Consider storing protocol buffers in serialized form, even in memory.
+In-memory protobuf objects have a large memory footprint (often 5x the wire +format size), potentially spread across many cache lines. So if your application +is going to keep many protobuf objects live for long periods of time, consider +storing them in serialized form.
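+A sketch of the idea, reusing the PointListProto message from the earlier
+benchmark (generated header name assumed):
+#include <cstddef>
+#include <string>
+#include <vector>
+
+#include "point.pb.h"  // assumed generated header for PointListProto
+
+// Long-lived records kept as serialized bytes; a message object is only
+// materialized when a record is actually accessed.
+std::vector<std::string> stored_records;
+
+void StoreRecord(const PointListProto& points) {
+  stored_records.push_back(points.SerializeAsString());
+}
+
+bool LoadRecord(size_t index, PointListProto* points) {
+  // Pay the parse cost on access instead of keeping a large in-memory object
+  // tree resident for every record.
+  return points->ParseFromString(stored_records[index]);
+}
+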
+ +Avoid protobuf map fields.
+Protobuf map fields have performance problems that usually outweigh the small +syntactic convenience they provide. Prefer using non-protobuf maps initialized +from protobuf contents:
+msg.proto
+map<string, bytes> env_variables = 5;
+
+message Var {
+ string key = 1;
+ bytes value = 2;
+}
+repeated Var env_variables = 5;
+
+
+Use protobuf message definition with a subset of the fields.
+If you want to access only a few fields of a large message type, consider +defining your own protocol buffer message type that mimics the original type, +but only defines the fields that you care about. Here's an example:
+message FullMessage {
+ optional int32 field1 = 1;
+ optional BigMessage field2 = 2;
+ optional int32 field3 = 3;
+  repeated AnotherBigMessage field4 = 4;
+ ...
+ optional int32 field100 = 100;
+}
+
+message SubsetMessage {
+ optional int32 field3 = 3;
+ optional int32 field88 = 88;
+}
+
+By parsing a serialized FullMessage into a SubsetMessage, only two out of a
+hundred fields are parsed and others are treated as unknown fields. Consider
+using APIs that discard unknown fields to improve performance even more when
+appropriate.
Reuse protobuf objects when possible.
+Declare protobuf objects outside loops so that their allocated storage can be +reused across loop iterations.
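+For example (sketch; PointListProto is the message from the earlier benchmark,
+and Process is a hypothetical consumer):
+#include <string>
+#include <vector>
+
+#include "point.pb.h"  // assumed generated header for PointListProto
+
+void Process(const PointListProto& points);  // hypothetical consumer
+
+void ProcessAll(const std::vector<std::string>& records) {
+  PointListProto points;  // declared once, outside the loop
+  for (const std::string& record : records) {
+    points.Clear();  // retains already-allocated storage for reuse
+    if (!points.ParseFromString(record)) continue;
+    Process(points);
+  }
+}
+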
+ +Speed up LanguageFromCode (use absl::flat_hash_map +instead of a __gnu_cxx::hash_map).
+languages.cc
+ +class CodeToLanguage
+ ...
+ : public __gnu_cxx::hash_map<absl::string_view, i18n::languages::Language,
+ CodeHash, CodeCompare> {
+
+class CodeToLanguage
+ ...
+ : public absl::flat_hash_map<absl::string_view, i18n::languages::Language,
+ CodeHash, CodeCompare> {
+
+Benchmark results:
+name old time/op new time/op delta
+BM_CodeToLanguage 19.4ns ± 1% 10.2ns ± 3% -47.47% (p=0.000 n=8+10)
+
+
+Speed up stats publish/unpublish (an older change, so +uses dense_hash_map instead of absl::flat_hash_map, which did not exist at the +time).
+publish.cc
+ +typedef hash_map<uint64, Publication*> PublicationMap;
+static PublicationMap* publications = NULL;
+
+typedef dense_hash_map<uint64, Publication*> PublicationMap;
+static PublicationMap* publications GUARDED_BY(mu) = NULL;
+
+Use dense_hash_map instead of hash_map for keeping track of +SelectServer alarms (would use absl::flat_hash_map today).
+alarmer.h
+ +typedef hash_map<int, Alarm*> AlarmList;
+
+typedef dense_hash_map<int, Alarm*> AlarmList;
+
+Use btree_set instead of std::set to represent a very heavily used +work-queue.
+register_allocator.h
+using container_type = std::set<WorklistItem>;
+
+using container_type = absl::btree_set<WorklistItem>;
+
+
+Use InlinedBitVector instead of std::vector<bool>, and
+then use FindNextSetBit to find the next item of interest.
+block_encoder.cc
+ +vector<bool> live_reads(nreads);
+...
+for (int offset = 0; offset < b_.block_width(); offset++) {
+ ...
+ for (int r = 0; r < nreads; r++) {
+ if (live_reads[r]) {
+
+util::bitmap::InlinedBitVector<4096> live_reads(nreads);
+...
+for (int offset = 0; offset < b_.block_width(); offset++) {
+ ...
+ for (size_t r = 0; live_reads.FindNextSetBit(&r); r++) {
+ DCHECK(live_reads[r]);
+
+Use InlinedVector instead of std::vector in various places.
+bundle.h
+class Bundle {
+ public:
+ ...
+ private:
+ // Sequence of (slotted instruction, unslotted immediate operands).
+ std::vector<InstructionRecord> instructions_;
+ ...
+};
+
+class Bundle {
+ public:
+ ...
+ private:
+ // Sequence of (slotted instruction, unslotted immediate operands).
+ absl::InlinedVector<InstructionRecord, 2> instructions_;
+ ...
+};
+
+
+Simple type change saves ~8TiB of memory in Spanner.
+table_ply.h
+ +class TablePly {
+ ...
+ // Returns the set of data columns stored in this file for this table.
+ const std::vector<FamilyId>& modified_data_columns() const {
+ return modified_data_columns_;
+ }
+ ...
+ private:
+ ...
+ std::vector<FamilyId> modified_data_columns_; // Data columns in the table.
+
+#include "util/gtl/vector32.h"
+ ...
+ // Returns the set of data columns stored in this file for this table.
+ absl::Span<const FamilyId> modified_data_columns() const {
+ return modified_data_columns_;
+ }
+ ...
+
+ ...
+ // Data columns in the table.
+ gtl::vector32<FamilyId> modified_data_columns_;
+
+Use gtl::small_map in tflite_model.
+tflite_model.cc
+ +using ChoiceIdToContextMap = gtl::flat_hash_map<int, TFLiteContext*>;
+
+using ChoiceIdToContextMap =
+ gtl::small_map<gtl::flat_hash_map<int, TFLiteContext*>>;
+
+Use gtl::small_ordered_set to hold set of listeners.
+broadcast_stream.h
+ +class BroadcastStream : public ParsedRtpTransport {
+ ...
+ private:
+ ...
+ std::set<ParsedRtpTransport*> listeners_ ABSL_GUARDED_BY(listeners_mutex_);
+};
+
+class BroadcastStream : public ParsedRtpTransport {
+ ...
+ private:
+ ...
+ using ListenersSet =
+ gtl::small_ordered_set<std::set<ParsedRtpTransport*>, 10>;
+ ListenersSet listeners_ ABSL_GUARDED_BY(listeners_mutex_);
+
+Use intrusive_list to keep track of inflight requests for +each index row update.
+row-update-sender-inflight-set.h
+ +std::set<int64> inflight_requests_ GUARDED_BY(mu_);
+
+class SeqNum : public gtl::intrusive_link<SeqNum> {
+ ...
+ int64 val_ = -1;
+ ...
+};
+...
+gtl::intrusive_list<SeqNum> inflight_requests_ GUARDED_BY(mu_);
+
+Avoid StatusOr<int64> return type for +RoundUpToAlignment() function.
+best_fit_allocator.cc
+ +absl::StatusOr<int64> BestFitAllocator::RoundUpToAlignment(int64 bytes) const {
+ TPU_RET_CHECK_GE(bytes, 0);
+
+ const int64 max_aligned = MathUtil::RoundDownTo<int64>(
+ std::numeric_limits<int64>::max(), alignment_in_bytes_);
+ if (bytes > max_aligned) {
+ return util::ResourceExhaustedErrorBuilder(ABSL_LOC)
+ << "Attempted to allocate "
+ << strings::HumanReadableNumBytes::ToString(bytes)
+ << " which after aligning to "
+ << strings::HumanReadableNumBytes::ToString(alignment_in_bytes_)
+ << " cannot be expressed as an int64.";
+ }
+
+ return MathUtil::RoundUpTo<int64>(bytes, alignment_in_bytes_);
+}
+
+best_fit_allocator.h
+ +// Rounds bytes up to nearest multiple of alignment_.
+// REQUIRES: bytes >= 0.
+// REQUIRES: result does not overflow int64.
+// REQUIRES: alignment_in_bytes_ is a power of 2 (checked in constructor).
+int64 RoundUpToAlignment(int64 bytes) const {
+ DCHECK_GE(bytes, 0);
+ DCHECK_LE(bytes, max_aligned_bytes_);
+ int64 result =
+ ((bytes + (alignment_in_bytes_ - 1)) & ~(alignment_in_bytes_ - 1));
+ DCHECK_EQ(result, MathUtil::RoundUpTo<int64>(bytes, alignment_in_bytes_));
+ return result;
+}
+
+Add ShapeUtil::ForEachIndexNoStatus to avoid creating a +Status return object for every element of a tensor.
+shape_util.h
+ +using ForEachVisitorFunction =
+ absl::FunctionRef<StatusOr<bool>(absl::Span<const int64_t>)>;
+ ...
+static void ForEachIndex(const Shape& shape, absl::Span<const int64_t> base,
+ absl::Span<const int64_t> count,
+ absl::Span<const int64_t> incr,
+ const ForEachVisitorFunction& visitor_function);
+
+
+using ForEachVisitorFunctionNoStatus =
+ absl::FunctionRef<bool(absl::Span<const int64_t>)>;
+ ...
+static void ForEachIndexNoStatus(
+ const Shape& shape, absl::Span<const int64_t> base,
+ absl::Span<const int64_t> count, absl::Span<const int64_t> incr,
+ const ForEachVisitorFunctionNoStatus& visitor_function);
+
+literal.cc
+ +ShapeUtil::ForEachIndex(
+ result_shape, [&](absl::Span<const int64_t> output_index) {
+ for (int64_t i = 0, end = dimensions.size(); i < end; ++i) {
+ scratch_source_index[i] = output_index[dimensions[i]];
+ }
+ int64_t dest_index = IndexUtil::MultidimensionalIndexToLinearIndex(
+ result_shape, output_index);
+ int64_t source_index = IndexUtil::MultidimensionalIndexToLinearIndex(
+ shape(), scratch_source_index);
+ memcpy(dest_data + primitive_size * dest_index,
+ source_data + primitive_size * source_index, primitive_size);
+ return true;
+ });
+
+ShapeUtil::ForEachIndexNoStatus(
+ result_shape, [&](absl::Span<const int64_t> output_index) {
+ // Compute dest_index
+ int64_t dest_index = IndexUtil::MultidimensionalIndexToLinearIndex(
+ result_shape, result_minor_to_major, output_index);
+
+ // Compute source_index
+ int64_t source_index;
+ for (int64_t i = 0, end = dimensions.size(); i < end; ++i) {
+ scratch_source_array[i] = output_index[dimensions[i]];
+ }
+ if (src_shape_dims == 1) {
+ // Fast path for this case
+ source_index = scratch_source_array[0];
+ DCHECK_EQ(source_index,
+ IndexUtil::MultidimensionalIndexToLinearIndex(
+ src_shape, src_minor_to_major, scratch_source_span));
+ } else {
+ source_index = IndexUtil::MultidimensionalIndexToLinearIndex(
+ src_shape, src_minor_to_major, scratch_source_span);
+ }
+ // Move one element from source_index in source to dest_index in dest
+ memcpy(dest_data + PRIMITIVE_SIZE * dest_index,
+ source_data + PRIMITIVE_SIZE * source_index, PRIMITIVE_SIZE);
+ return true;
+ });
+
+In TF_CHECK_OK, avoid creating Ok object in order to test +for ok().
+status.h
+ +#define TF_CHECK_OK(val) CHECK_EQ(::tensorflow::Status::OK(), (val))
+#define TF_QCHECK_OK(val) QCHECK_EQ(::tensorflow::Status::OK(), (val))
+
+extern tensorflow::string* TfCheckOpHelperOutOfLine(
+ const ::tensorflow::Status& v, const char* msg);
+inline tensorflow::string* TfCheckOpHelper(::tensorflow::Status v,
+ const char* msg) {
+ if (v.ok()) return nullptr;
+ return TfCheckOpHelperOutOfLine(v, msg);
+}
+#define TF_CHECK_OK(val) \
+ while (tensorflow::string* _result = TfCheckOpHelper(val, #val)) \
+ LOG(FATAL) << *(_result)
+#define TF_QCHECK_OK(val) \
+ while (tensorflow::string* _result = TfCheckOpHelper(val, #val)) \
+ LOG(QFATAL) << *(_result)
+
+Remove StatusOr from the hot path of remote procedure +calls (RPCs).
+Removal of StatusOr from a hot path eliminated a 14% CPU regression in RPC +benchmarks caused by an earlier change.
+privacy_context.h
+ +absl::StatusOr<privacy::context::PrivacyContext> GetRawPrivacyContext(
+ const CensusHandle& h);
+
+privacy_context_statusfree.h
+ +enum class Result {
+ kSuccess,
+ kNoRootScopedData,
+ kNoPrivacyContext,
+ kNoDDTContext,
+ kDeclassified,
+ kNoPrequestContext
+};
+...
+Result GetRawPrivacyContext(const CensusHandle& h,
+ PrivacyContext* privacy_context);
+
+absl::flat_hash_map compares one hash byte per key from a +group of keys using a single SIMD instruction.
+See Swiss Table Design Notes and +related CppCon 2017 and +CppCon 2019 talks by Matt +Kulukundis.
+raw_hash_set.h
+ +// Returns a bitmask representing the positions of slots that match hash.
+BitMask<uint32_t> Match(h2_t hash) const {
+ auto ctrl = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos));
+ auto match = _mm_set1_epi8(hash);
+ return BitMask<uint32_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(match, ctrl)));
+}
+
+Do single operations that deal with many bytes at once and fix
+things up afterwards, rather than deciding what to do byte by byte.
+ordered-code.cc
+ +int len = 0;
+while (val > 0) {
+ len++;
+ buf[9 - len] = (val & 0xff);
+ val >>= 8;
+}
+buf[9 - len - 1] = (unsigned char)len;
+len++;
+FastStringAppend(dest, reinterpret_cast<const char*>(buf + 9 - len), len);
+
+BigEndian::Store(val, buf + 1); // buf[0] may be needed for length
+const unsigned int length = OrderedNumLength(val);
+char* start = buf + 9 - length - 1;
+*start = length;
+AppendUpto9(dest, start, length + 1);
+
+Improve Reed-Solomon processing speed by handling +multiple interleaved input buffers more efficiently in chunks.
+Run on (12 X 3501 MHz CPUs); 2016-09-27T16:04:55.065995192-04:00
+CPU: Intel Haswell with HyperThreading (6 cores) dL1:32KB dL2:256KB dL3:15MB
+Benchmark Base (ns) New (ns) Improvement
+------------------------------------------------------------------
+BM_OneOutput/3/2 466867 351818 +24.6%
+BM_OneOutput/4/2 563130 474756 +15.7%
+BM_OneOutput/5/3 815393 688820 +15.5%
+BM_OneOutput/6/3 897246 780539 +13.0%
+BM_OneOutput/8/4 1270489 1137149 +10.5%
+BM_AllOutputs/3/2 848772 642942 +24.3%
+BM_AllOutputs/4/2 1067647 638139 +40.2%
+BM_AllOutputs/5/3 1739135 1151369 +33.8%
+BM_AllOutputs/6/3 2045817 1456744 +28.8%
+BM_AllOutputs/8/4 3012958 2484937 +17.5%
+BM_AllOutputsSetUpOnce/3/2 717310 493371 +31.2%
+BM_AllOutputsSetUpOnce/4/2 833866 600060 +28.0%
+BM_AllOutputsSetUpOnce/5/3 1537870 1137357 +26.0%
+BM_AllOutputsSetUpOnce/6/3 1802353 1398600 +22.4%
+BM_AllOutputsSetUpOnce/8/4 3166930 2455973 +22.4%
+
+
+Decode four integers at a time (circa 2004).
+Introduced a +GroupVarInt format +that encodes/decodes groups of 4 variable-length integers at a time in 5-17 +bytes, rather than one integer at a time. Decoding one group of 4 integers in +the new format takes ~1/3rd the time of decoding 4 individually varint-encoded +integers.
+groupvarint.cc
+ +const char* DecodeGroupVar(const char* p, int N, uint32* dest) {
+ assert(groupvar_initialized);
+ assert(N % 4 == 0);
+ while (N) {
+ uint8 tag = *p;
+ p++;
+
+ uint8* lenptr = &groupvar_table[tag].length[0];
+
+#define GET_NEXT \
+ do { \
+ uint8 len = *lenptr; \
+ *dest = UNALIGNED_LOAD32(p) & groupvar_mask[len]; \
+ dest++; \
+ p += len; \
+ lenptr++; \
+ } while (0)
+ GET_NEXT;
+ GET_NEXT;
+ GET_NEXT;
+ GET_NEXT;
+#undef GET_NEXT
+
+ N -= 4;
+ }
+ return p;
+}
+
+Encode groups of 4 k-bit numbers at a time.
+Added KBitStreamEncoder and KBitStreamDecoder classes to encode/decode 4 k-bit +numbers at a time into a bit stream. Since K is known at compile time, the +encoding and decoding can be quite efficient. E.g., since four numbers are +encoded at a time, the code can assume that the stream is always byte-aligned +(for even k), or nibble-aligned (for odd k).
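+A minimal sketch of the packing idea (illustrative only, not the actual
+KBitStreamEncoder; for simplicity this version rounds each 4-value group up to
+whole bytes rather than staying nibble-aligned for odd K):
+#include <cstdint>
+#include <string>
+
+// Pack four K-bit values into one little-endian group and append it. Since K
+// is a compile-time constant, the masks, shifts, and byte count are all
+// computed at compile time.
+template <int K>
+void AppendFourKBitValues(uint32_t a, uint32_t b, uint32_t c, uint32_t d,
+                          std::string* out) {
+  static_assert(K >= 1 && K <= 16, "four K-bit values must fit in 64 bits");
+  constexpr uint64_t kMask = (uint64_t{1} << K) - 1;
+  uint64_t group = (a & kMask) | ((b & kMask) << K) |
+                   ((c & kMask) << (2 * K)) | ((d & kMask) << (3 * K));
+  constexpr int kBytes = (4 * K + 7) / 8;  // 4*K bits, rounded up to bytes
+  for (int i = 0; i < kBytes; ++i) {
+    out->push_back(static_cast<char>((group >> (8 * i)) & 0xff));
+  }
+}
+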
+ +Speed up GPU memory allocator by ~40%.
+36-48% speedup in allocation/deallocation speed for GPUBFCAllocator:
+Identify chunks by a handle number, rather than by a pointer to a Chunk.
+Chunk data structures are now allocated in a vector<Chunk>, and a handle
+is an index into this vector to refer to a particular chunk. This allows the
+next and prev pointers in Chunk to be ChunkHandle (4 bytes), rather than
+Chunk* (8 bytes).
When a Chunk object is no longer in use, we maintain a free list of Chunk
+objects, whose head is designated by ChunkHandle free_chunks_list_, and
+with the Chunk->next pointing to the next free list entry. Together with
+(1), this allows us to avoid heap allocation/deallocation of Chunk objects
+in the allocator, except (rarely) when the vector<Chunk> grows. It also
+makes all the memory for Chunk objects contiguous.
Rather than having the bins_ data structure be a std::set and using
+lower_bound to locate the appropriate bin given a byte_size, we instead have
+an array of bins, indexed by a function that is log₂(byte_size/256). This
+allows the bin to be located with a few bit operations, rather than a binary
+search tree lookup. It also allows us to allocate the storage for all the
+Bin data structures in a contiguous array, rather than in many different
+cache lines. This reduces the number of cache lines that must be moved
+around between cores when multiple threads are doing allocations. (A sketch
+of the bin-index computation appears after this list.)
+Added fast path to GPUBFCAllocator::AllocateRaw that first tries to allocate +memory without involving the retry_helper_. If an initial attempt fails +(returns nullptr), then we go through the retry_helper_, but normally we can +avoid several levels of procedure calls as well as the +allocation/deallocation of a std::function with several arguments.
+Commented out most of the VLOG calls. These can be reenabled selectively +when needed for debugging purposes by uncommenting and recompiling.
+Added multi-threaded benchmark to test allocation under contention.
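+A sketch of the bin-index computation mentioned above (illustrative, not the
+actual GPUBFCAllocator code):
+#include <cstddef>
+#include <cstdint>
+
+// Map an allocation size to a bin with a couple of bit operations, i.e.
+// floor(log2(byte_size / 256)), instead of a std::set lower_bound lookup.
+// Sizes below 512 bytes land in bin 0.
+inline int BinNumForSize(size_t byte_size) {
+  uint64_t v = static_cast<uint64_t>(byte_size) >> 8;  // byte_size / 256
+  if (v <= 1) return 0;
+  return 63 - __builtin_clzll(v);  // floor(log2(v)); GCC/Clang builtin
+}
+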
+Speeds up ptb_word_lm on my desktop machine with a Titan X card from 8036 words +per second to 8272 words per second (+2.9%).
+Run on (40 X 2801 MHz CPUs); 2016/02/16-15:12:49
+CPU: Intel Ivybridge with HyperThreading (20 cores) dL1:32KB dL2:256KB dL3:25MB
+Benchmark Base (ns) New (ns) Improvement
+------------------------------------------------------------------
+BM_Allocation 347 184 +47.0%
+BM_AllocationThreaded/1 351 181 +48.4%
+BM_AllocationThreaded/4 2470 1975 +20.0%
+BM_AllocationThreaded/16 11846 9507 +19.7%
+BM_AllocationDelayed/1 392 199 +49.2%
+BM_AllocationDelayed/10 285 169 +40.7%
+BM_AllocationDelayed/100 245 149 +39.2%
+BM_AllocationDelayed/1000 238 151 +36.6%
+
+
+Speed up Pathways throughput by ~20% via a set of +miscellaneous changes.
+Unified a bunch of special fast descriptor parsing functions into a single +ParsedDescriptor class and use this class in more places to avoid expensive +full parse calls.
+Change several protocol buffer fields from string to bytes (avoids +unnecessary utf-8 checks and associated error handling code).
+DescriptorProto.inlined_contents is now a string, not a Cord (it is expected +to be used only for small-ish tensors). This necessitated the addition of a +bunch of copying helpers in tensor_util.cc (need to now support both strings +and Cords).
+Use flat_hash_map instead of std::unordered_map in a few places.
+Added MemoryManager::LookupMany for use by Stack op instead of calling +Lookup per batch element. This change reduces setup overhead like locking.
+Removed some unnecessary string creation in TransferDispatchOp.
+Performance results for transferring a batch of 1000 1KB tensors from one +component to another in the same process:
+Before: 227.01 steps/sec
+After: 272.52 steps/sec (+20% throughput)
+
+
+~15% XLA compiler performance improvement through a +series of changes.
+Some changes to speed up XLA compilation:
+In SortComputationsByContent, return false if a == b in comparison function, +to avoid serializing and fingerprinting long computation strings.
+Turn CHECK into DCHECK to avoid touching an extra cache line in +HloComputation::ComputeInstructionPostOrder
+Avoid making an expensive copy of the front instruction in +CoreSequencer::IsVectorSyncHoldSatisfied().
+Rework 2-argument HloComputation::ToString and HloComputation::ToCord +routines to do the bulk of the work in terms of appending to std::string, +rather than appending to a Cord.
+Change PerformanceCounterSet::Increment to just do a single hash table +lookup rather than two.
+Streamline Scoreboard::Update code
+Overall speedup of 14% in XLA compilation time for one important +model.
+ +Speed up low level logging in Google Meet application +code.
+Speed up ScopedLogId, which is on the critical path for each packet:
+Removed LOG_EVERY_N(ERROR, ...) messages that seemed to be there only
+to see if invariants were violated.
+With fewer LOG_EVERY_N_SECONDS(ERROR, ...) statements, the affected routines
+are now small enough to inline.
+Replaced the InlinedVector<...> used for maintaining the thread local state
+with something simpler. Since we never were growing beyond size 4 anyway, the
+InlinedVector's functionality was more general than needed.
+
+Base: Baseline plus the code in scoped_logid_test.cc to add the benchmark
+New: This changelist
+
+CPU: Intel Ivybridge with HyperThreading (20 cores) dL1:32KB dL2:256KB dL3:25MB
+Benchmark Base (ns) New (ns) Improvement
+----------------------------------------------------------------------------
+BM_ScopedLogId/threads:1 8 4 +52.6%
+BM_ScopedLogId/threads:2 8 4 +51.9%
+BM_ScopedLogId/threads:4 8 4 +52.9%
+BM_ScopedLogId/threads:8 8 4 +52.1%
+BM_ScopedLogId/threads:16 11 6 +44.0%
+
+
+
+Reduce XLA compilation time by ~31% by improving Shape +handling.
+Several changes to improve XLA compiler performance:
+Improved performance of ShapeUtil::ForEachIndex... iteration in a few ways:
+In ShapeUtil::ForEachState, save just pointers to the arrays represented +by the spans, rather than the full span objects.
+Pre-form a ShapeUtil::ForEachState::indexes_span pointing at the +ShapeUtil::ForEachState::indexes vector, rather than constructing this +span from the vector on every loop iteration.
+Save a ShapeUtil::ForEachState::indexes_ptr pointer to the backing store +of the ShapeUtil::ForEachState::indexes vector, allowing simple array +operations in ShapeUtil::ForEachState::IncrementDim(), rather than more +expensive vector::operator[] operations.
+Save a ShapeUtil::ForEachState::minor_to_major array pointer initialized +in the constructor by calling shape.layout().minor_to_major().data() +rather than calling LayoutUtil::Minor(...) for each dimension for each +iteration.
+Inlined the ShapeUtil::ForEachState constructor and the +ShapeUtil::ForEachState::IncrementDim() routines
+Improved the performance of ShapeUtil::ForEachIndex iteration for call sites
+that don't need the functionality of returning a Status in the passed in
+function. Did this by introducing ShapeUtil::ForEachIndexNoStatus variants,
+which accept a ForEachVisitorFunctionNoStatus (which returns a plain bool).
+This is faster than the ShapeUtil::ForEachIndex routines, which accept a
+ForEachVisitorFunction (which returns a StatusOr<bool>, which requires an
+expensive StatusOr<bool> destructor call per element that we iterate
+over).
Improved performance of LiteralBase::Broadcast in several ways:
+Introduced templated BroadcastHelper routine in literal.cc that is +specialized for different primitive byte sizes (without this, +primitive_size was a runtime variable and so the compiler couldn't do a +very good job of optimizing the memcpy that occurred per element, and +would invoke the general memcpy path that assumes the byte count is +fairly large, even though in our case it is a tiny power of 2 (typically +1, 2, 4, or 8)).
+Avoided all but one of ~(5 + num_dimensions + num_result_elements) +virtual calls per Broadcast call by making a single call to 'shape()' at +the beginning of the LiteralBase::Broadcast routine. The innocuous +looking 'shape()' calls that were sprinkled throughout end up boiling +down to "root_piece().subshape()", where subshape() is a virtual +function.
+In the BroadcastHelper routine, Special-cased the source dimensions +being one and avoided a call to +IndexUtil::MultiDimensionalIndexToLinearIndex for this case.
+In BroadcastHelper, used a scratch_source_array pointer variable that +points into the backing store of the scratch_source_index vector, and +used that directly to avoid vector::operator[] operations inside the +per-element code. Also pre-computed a scratch_source_span that points to +the scratch_source_index vector outside the per-element loop in +BroadcastHelper, to avoid constructing a span from the vector on each +element.
+Introduced new three-argument variant of +IndexUtil::MultiDimensionalIndexToLinearIndex where the caller passes in +the minor_to_major span associated with the shape argument. Used this in +BroadcastHelper to compute this for the src and dst shapes once per +Broadcast, rather than once per element copied.
+In ShardingPropagation::GetShardingFromUser, for the HloOpcode::kTuple case, +only call user.sharding().GetSubSharding(...) if we have found the operand +to be of interest. Avoiding calling it eagerly reduces CPU time in this +routine for one lengthy compilation from 43.7s to 2.0s.
+Added benchmarks for ShapeUtil::ForEachIndex and Literal::Broadcast and for +the new ShapeUtil::ForEachIndexNoStatus.
+Base is with the benchmark additions of
+BM_ForEachIndex and BM_BroadcastVectorToMatrix (and BUILD file change to add
+benchmark dependency), but no other changes.
+
+New is this cl
+
+Run on (72 X 1357.56 MHz CPU s) CPU Caches: L1 Data 32 KiB (x36)
+L1 Instruction 32 KiB (x36) L2 Unified 1024 KiB (x36) L3 Unified 25344 KiB (x2)
+
+Benchmark Base (ns) New (ns) Improvement
+----------------------------------------------------------------------------
+BM_MakeShape 18.40 18.90 -2.7%
+BM_MakeValidatedShape 35.80 35.60 +0.6%
+BM_ForEachIndex/0 57.80 55.80 +3.5%
+BM_ForEachIndex/1 90.90 85.50 +5.9%
+BM_ForEachIndex/2 1973606 1642197 +16.8%
+
+The newly added ForEachIndexNoStatus is considerably faster than the +ForEachIndex variant (it only exists in this new cl, but the benchmark work that +is done by BM_ForEachIndexNoStatus/NUM is comparable to the BM_ForEachIndex/NUM +results above).
+Benchmark Base (ns) New (ns) Improvement
+----------------------------------------------------------------------------
+BM_ForEachIndexNoStatus/0 0 46.90 ----
+BM_ForEachIndexNoStatus/1 0 65.60 ----
+BM_ForEachIndexNoStatus/2 0 1001277 ----
+
+Broadcast performance improves by ~58%.
+Benchmark Base (ns) New (ns) Improvement
+----------------------------------------------------------------------------
+BM_BroadcastVectorToMatrix/16/16 5556 2374 +57.3%
+BM_BroadcastVectorToMatrix/16/1024 319510 131075 +59.0%
+BM_BroadcastVectorToMatrix/1024/1024 20216949 8408188 +58.4%
+
+Macro results from doing ahead-of-time compilation of a large language model +(program does more than just the XLA compilation, but spends a bit less than +half its time in XLA-related code):
+Baseline program overall: 573 seconds With this cl program overall: 465 seconds +(+19% improvement)
+Time spent in compiling the two largest XLA programs in running this program:
+Baseline: 141s + 143s = 284s
+With this CL: 99s + 95s = 194s (+31% improvement)
+ +Reduce compilation time for large programs by ~22% in +Plaque (a distributed execution framework).
+Small tweaks to speed up compilation by ~22%.
+Used a single map keyed by pair<package, opname> instead of a btree of btrees (a sketch of the flattened key appears after the measurements below).
+Measurement of speed on large programs (~45K ops):
+name old time/op new time/op delta
+BM_CompileLarge 28.5s ± 2% 22.4s ± 2% -21.61% (p=0.008 n=5+5)
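+
+A minimal sketch of the flattened-key idea (illustrative only; OpInfo and the exact container are hypothetical stand-ins, not the actual Plaque types):
+
+#include <string>
+#include <utility>
+#include "absl/container/btree_map.h"
+
+struct OpInfo { /* per-op metadata (hypothetical) */ };
+
+// Before: a btree of btrees; every lookup walks two trees and each package
+// carries its own inner tree.
+absl::btree_map<std::string, absl::btree_map<std::string, OpInfo>> ops_by_package;
+
+// After: a single map keyed by (package, opname); one lookup, one tree.
+absl::btree_map<std::pair<std::string, std::string>, OpInfo> ops;
+
+OpInfo* FindOp(const std::string& package, const std::string& opname) {
+  auto it = ops.find(std::make_pair(package, opname));
+  return it == ops.end() ? nullptr : &it->second;
+}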
+
+
+MapReduce improvements (~2X speedup for wordcount +benchmark).
+MapReduce speedups:
+The combiner data structures for the SafeCombinerMapOutput class have been changed. Rather than using a hash_multimap<SafeCombinerKey, StringPiece>, which had a hash table entry for each unique key/value inserted in the table, we instead use a hash_map<SafeCombinerKey, ValuePtr*>, where ValuePtr is a linked list of values and repetition counts (a simplified sketch appears after the three points below). This helps in three ways:
+It significantly reduces memory usage, since we only use "sizeof(ValuePtr) + value_len" bytes for each value, rather than "sizeof(SafeCombinerKey) + sizeof(StringPiece) + value_len + new hash table entry overhead" for each value. This means that we flush the reducer buffer less often.
+It's significantly faster, since we avoid extra hash table entries when +we're inserting a new value for a key that already exists in the table +(and instead we just hook the value into the linked list of values for +that key).
+Since we associate a repetition count with each value in the linked +list, we can represent this sequence:
+Output(key, "1");
+Output(key, "1");
+Output(key, "1");
+Output(key, "1");
+Output(key, "1");
+
+as a single entry in the linked list for "key" with a repetition count of 5. +Internally we yield "1" five times to the user-level combining function. (A +similar trick could be applied on the reduce side, perhaps).
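+A simplified sketch of the reworked combiner bookkeeping (hypothetical stand-in types, not the actual MapReduce code; flushing to the user-level combining function and freeing the nodes are omitted):
+
+#include <string>
+#include <unordered_map>
+
+// One hash-table entry per unique key; each key's values hang off a singly
+// linked list, and a repeated value just bumps a repetition count.
+struct ValuePtr {
+  std::string value;
+  int count;       // how many times this value was emitted for the key
+  ValuePtr* next;  // next distinct value for this key
+};
+
+// Stand-in for hash_map<SafeCombinerKey, ValuePtr*>.
+std::unordered_map<std::string, ValuePtr*> combiner;
+
+void Output(const std::string& key, const std::string& value) {
+  ValuePtr*& head = combiner[key];      // one hash lookup, even for repeat keys
+  if (head != nullptr && head->value == value) {
+    head->count++;                      // e.g. five Output(key, "1") calls become one node with count == 5
+    return;
+  }
+  head = new ValuePtr{value, 1, head};  // prepend a node for the new value
+}
+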
+(Minor) Added a test for "nshards == 1" to the default +MapReductionBase::KeyFingerprintSharding function that avoids fingerprinting +the key entirely if we are just using 1 reduce shard (since we can just +return 0 directly in that case without examining the key).
+Turned some VLOG(3) statements into DVLOG(3) in the code path that is called +for each key/value added to the combiner.
+Reduces time for one wordcount benchmark from 12.56s to 6.55s.
+ +Rework the alarm handling code in the SelectServer to significantly improve its performance (adding and removing an alarm goes from 771 ns to 281 ns).
+Reworked the alarm handling code in the SelectServer to significantly improve +its performance.
+Changes:
+Switched to using AdjustablePriorityQueue<Alarm> instead of a set<Alarm*> for the AlarmQueue. This significantly speeds up alarm handling, reducing the time taken to add and remove an alarm from 771 nanoseconds to 281 nanoseconds. This change avoids an allocation/deallocation per alarm setup (for the red-black tree node in the STL set object) and also gives much better cache locality: since the AdjustablePriorityQueue is a heap implemented in a vector rather than a red-black tree, fewer cache lines are touched when manipulating the AlarmQueue on every trip through the selectserver loop. (An illustrative sketch of the heap-in-a-vector idea appears after this list of changes.)
+Converted AlarmList in Alarmer from a hash_map to a dense_hash_map to avoid another allocation/deallocation per alarm addition/deletion (this also improves cache locality when adding/removing alarms).
+Removed the num_alarms_stat_ and num_closures_stat_
+MinuteTenMinuteHourStat objects, and the corresponding exported variables.
+Although monitoring these seems nice, in practice they add significant
+overhead to critical networking code. If I had left these variables in as
+Atomic32 variables instead of MinuteTenMinuteHourStat, they would have still
+increased the cost of adding and removing alarms from 281 nanoseconds to 340
+nanoseconds.
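+An illustrative sketch of the heap-in-a-vector idea (this is not AdjustablePriorityQueue's real interface; it only shows why a vector-backed heap with stored slot indices avoids the per-alarm node allocation and pointer chasing of a set<Alarm*>):
+
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+struct Alarm {
+  int64_t deadline_usec = 0;
+  int heap_index = -1;  // slot in the heap vector; -1 when not queued
+};
+
+// Min-heap of alarm pointers stored contiguously in a vector. Add/Remove are
+// O(log n) with no per-node allocation, unlike set<Alarm*>, which allocates a
+// red-black tree node per alarm and scatters those nodes across memory.
+class AlarmHeap {
+ public:
+  void Add(Alarm* a) {
+    a->heap_index = static_cast<int>(v_.size());
+    v_.push_back(a);
+    SiftUp(a->heap_index);
+  }
+  void Remove(Alarm* a) {
+    const int i = a->heap_index;
+    const int last = static_cast<int>(v_.size()) - 1;
+    if (i != last) {
+      v_[i] = v_[last];
+      v_[i]->heap_index = i;
+    }
+    v_.pop_back();
+    a->heap_index = -1;
+    if (i != last) {  // re-establish the heap property at slot i
+      SiftUp(i);
+      SiftDown(i);
+    }
+  }
+  Alarm* Top() const { return v_.empty() ? nullptr : v_[0]; }
+
+ private:
+  void Swap(int i, int j) {
+    std::swap(v_[i], v_[j]);
+    v_[i]->heap_index = i;
+    v_[j]->heap_index = j;
+  }
+  void SiftUp(int i) {
+    while (i > 0 && v_[i]->deadline_usec < v_[(i - 1) / 2]->deadline_usec) {
+      Swap(i, (i - 1) / 2);
+      i = (i - 1) / 2;
+    }
+  }
+  void SiftDown(int i) {
+    const int n = static_cast<int>(v_.size());
+    while (true) {
+      int smallest = i;
+      const int l = 2 * i + 1, r = 2 * i + 2;
+      if (l < n && v_[l]->deadline_usec < v_[smallest]->deadline_usec) smallest = l;
+      if (r < n && v_[r]->deadline_usec < v_[smallest]->deadline_usec) smallest = r;
+      if (smallest == i) return;
+      Swap(i, smallest);
+      i = smallest;
+    }
+  }
+  std::vector<Alarm*> v_;  // contiguous storage: fewer cache lines touched
+};
+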
+Benchmark results before this change
+Benchmark Time(ns) CPU(ns) Iterations
+-----------------------------------------------------------
+BM_AddAlarm/1 902 771 777777
+
+With this change
+Benchmark Time(ns) CPU(ns) Iterations
+-----------------------------------------------------------
+BM_AddAlarm/1 324 281 2239999
+
+
+3.3X improvement in index serving speed!
+We found a number of performance issues when planning a switch from on-disk to in-memory index serving in 2001. This change fixed many of these problems and took us from 150 to over 500 in-memory queries per second (for a 2 GB in-memory index on a dual-processor Pentium III machine).
+