//===-- tsan_clock.h --------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file is a part of ThreadSanitizer (TSan), a race detector.
//
//===----------------------------------------------------------------------===//
#ifndef TSAN_CLOCK_H
#define TSAN_CLOCK_H

#include "tsan_defs.h"
#include "tsan_dense_alloc.h"

namespace __tsan {

typedef DenseSlabAlloc ClockAlloc;
typedef DenseSlabAllocCache ClockCache;

// The clock that lives in sync variables (mutexes, atomics, etc).
class SyncClock {
 public:
  SyncClock();
  ~SyncClock();

  uptr size() const;

  // These are used only in tests.
  u64 get(unsigned tid) const;
  u64 get_clean(unsigned tid) const;

  void Resize(ClockCache *c, uptr nclk);
  void Reset(ClockCache *c);

  void DebugDump(int(*printf)(const char *s, ...));

  // Clock element iterator.
  // Note: it iterates only over the table without regard to dirty entries.
  class Iter {
   public:
    explicit Iter(SyncClock* parent);
    Iter& operator++();
    bool operator!=(const Iter& other);
    ClockElem &operator*();

   private:
    SyncClock *parent_;
    // [pos_, end_) is the current continuous range of clock elements.
    ClockElem *pos_;
    ClockElem *end_;
    int block_;  // Current second-level block number.

    NOINLINE void Next();
  };

  Iter begin();
  Iter end();

 private:
  friend class ThreadClock;
  friend class Iter;
  static const uptr kDirtyTids = 2;

  struct Dirty {
    u32 tid() const { return tid_ == kShortInvalidTid ? kInvalidTid : tid_; }
    void set_tid(u32 tid) {
      tid_ = tid == kInvalidTid ? kShortInvalidTid : tid;
    }
    u64 epoch : kClkBits;

   private:
    // Full kInvalidTid won't fit into Dirty::tid.
    static const u64 kShortInvalidTid = (1ull << (64 - kClkBits)) - 1;
    u64 tid_ : 64 - kClkBits;  // kShortInvalidTid if not active.
  };

  static_assert(sizeof(Dirty) == 8, "Dirty is not 64bit");

  unsigned release_store_tid_;
  unsigned release_store_reused_;
  Dirty dirty_[kDirtyTids];
  // If size_ is 0, tab_ is nullptr.
  // If size_ <= 64 (kClockCount), tab_ contains a pointer to an array with
  // 64 ClockElem's (ClockBlock::clock).
  // Otherwise, tab_ points to an array with up to 127 u32 elements,
  // each pointing to a second-level 512-byte block with 64 ClockElem's.
  // Unused space in the first-level ClockBlock is used to store additional
  // clock elements.
  // The last u32 element in the first-level ClockBlock is always used as
  // a reference counter.
  //
  // See the following scheme for details.
  // All memory blocks are 512 bytes (allocated from ClockAlloc).
  // Clock (clk) elements are 64 bits.
  // Idx and ref are 32 bits.
  //
  // tab_
  //  |
  //  \/
  //  +----------------------------------------------------+
  //  | clk128 | clk129 | ...unused... | idx1 | idx0 | ref |
  //  +----------------------------------------------------+
  //                                      |      |
  //                                      |      \/
  //                                      |      +----------------+
  //                                      |      | clk0 ... clk63 |
  //                                      |      +----------------+
  //                                      \/
  //                                      +------------------+
  //                                      | clk64 ... clk127 |
  //                                      +------------------+
  //
  // Note: dirty entries, if active, always override what's stored in the
  // clock.
  ClockBlock *tab_;
  u32 tab_idx_;
  u16 size_;
  u16 blocks_;  // Number of second-level blocks.
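
  // Illustrative sketch of how a thread index resolves against the two-level
  // table described above (a sketch only, assuming kClockCount == 64 elements
  // per block; the authoritative logic lives in elem() in tsan_clock.cpp):
  //
  //   uptr tid = 70;           // some thread index
  //   uptr block = tid / 64;   // second-level block number: 1
  //   uptr off = tid % 64;     // position within that block: 6
  //   // get_block(block) yields the ClockAlloc index of that second-level
  //   // block; the element is then that block's clock[off].
  //
  // For size_ <= 64 there is no second level and tab_->clock is indexed
  // directly.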

  void Unshare(ClockCache *c);
  bool IsShared() const;
  bool Cachable() const;
  void ResetImpl();
  void FlushDirty();
  uptr capacity() const;
  u32 get_block(uptr bi) const;
  void append_block(u32 idx);
  ClockElem &elem(unsigned tid) const;
};

// The clock that lives in threads.
class ThreadClock {
 public:
  typedef DenseSlabAllocCache Cache;

  explicit ThreadClock(unsigned tid, unsigned reused = 0);

  u64 get(unsigned tid) const;
  void set(ClockCache *c, unsigned tid, u64 v);
  void set(u64 v);
  void tick();
  uptr size() const;

  void acquire(ClockCache *c, SyncClock *src);
  void releaseStoreAcquire(ClockCache *c, SyncClock *src);
  void release(ClockCache *c, SyncClock *dst);
  void acq_rel(ClockCache *c, SyncClock *dst);
  void ReleaseStore(ClockCache *c, SyncClock *dst);
  void ResetCached(ClockCache *c);
  void NoteGlobalAcquire(u64 v);

  void DebugReset();
  void DebugDump(int(*printf)(const char *s, ...));

 private:
  static const uptr kDirtyTids = SyncClock::kDirtyTids;
  // Index of the thread associated with the clock ("current thread").
  const unsigned tid_;
  const unsigned reused_;  // tid_ reuse count.
  // Current thread time when it acquired something from other threads.
  u64 last_acquire_;

  // Last time another thread has done a global acquire of this thread's
  // clock. It helps to avoid the problem described in:
  // https://github.com/golang/go/issues/39186
  // See test/tsan/java_finalizer2.cpp for a regression test.
  // Note the failure is _extremely_ hard to hit, so if you are trying
  // to reproduce it, you may want to run something like:
  // $ go get golang.org/x/tools/cmd/stress
  // $ stress -p=64 ./a.out
  //
  // The crux of the problem is roughly as follows.
  // A number of O(1) optimizations in the clocks algorithm assume proper
  // transitive cumulative propagation of clock values. The AcquireGlobal
  // operation may produce an inconsistent non-linearizable view of
  // thread clocks. Namely, it may acquire a later value from a thread
  // with a higher ID, but fail to acquire an earlier value from a thread
  // with a lower ID. If a thread that executed AcquireGlobal then releases
  // to a sync clock, it will spoil the sync clock with the inconsistent
  // values. If another thread later releases to the sync clock, the
  // optimized algorithm may break.
  //
  // The exact sequence of events that leads to the failure:
  // - thread 1 executes AcquireGlobal
  // - thread 1 acquires value 1 for thread 2
  // - thread 2 increments clock to 2
  // - thread 2 releases to sync object 1
  // - thread 3 at time 1
  // - thread 3 acquires from sync object 1
  // - thread 3 increments clock to 2
  // - thread 1 acquires value 2 for thread 3
  // - thread 1 releases to sync object 2
  // - sync object 2 clock has 1 for thread 2 and 2 for thread 3
  // - thread 3 releases to sync object 2
  // - thread 3 sees value 2 in the clock for itself
  //   and decides that it has already released to the clock
  //   and did not acquire anything from other threads after that
  //   (the last_acquire_ check in the release operation)
  // - thread 3 does not update the value for thread 2 in the clock from 1 to 2
  // - thread 4 acquires from sync object 2
  // - thread 4 detects a false race with thread 2,
  //   as it should have been synchronized with thread 2 up to time 2,
  //   but because of the broken clock it is now synchronized only up to time 1
  //
  // The global_acquire_ value helps to prevent this scenario.
  // Namely, thread 3 will not trust any own clock values up to
  // global_acquire_ for the purposes of the last_acquire_ optimization.
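  //
  // A sketch of how the two watermarks are meant to combine (assumed shape,
  // not authoritative; the actual check lives in HasAcquiredAfterRelease()
  // in tsan_clock.cpp):
  //
  //   // The release() fast path may be trusted only if this thread's own
  //   // epoch stored in dst is newer than both watermarks:
  //   u64 my_epoch = dst->elem(tid_).epoch;
  //   bool acquired_after_release = my_epoch <= last_acquire_ ||
  //       my_epoch <= atomic_load_relaxed(&global_acquire_);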
  atomic_uint64_t global_acquire_;

  // Cached SyncClock (without dirty entries and release_store_tid_).
  // We reuse it for subsequent store-release operations without intervening
  // acquire operations. Since it is shared (and thus constant), the clock
  // value for the current thread is then stored in dirty entries in the
  // SyncClock. We hold a reference to the table while it is cached here.
  u32 cached_idx_;
  u16 cached_size_;
  u16 cached_blocks_;

  // Number of active elements in the clk_ table (the rest is zeros).
  uptr nclk_;
  u64 clk_[kMaxTidInClock];  // Fixed size vector clock.

  bool IsAlreadyAcquired(const SyncClock *src) const;
  bool HasAcquiredAfterRelease(const SyncClock *dst) const;
  void UpdateCurrentThread(ClockCache *c, SyncClock *dst) const;
};

ALWAYS_INLINE u64 ThreadClock::get(unsigned tid) const {
  DCHECK_LT(tid, kMaxTidInClock);
  return clk_[tid];
}

ALWAYS_INLINE void ThreadClock::set(u64 v) {
  DCHECK_GE(v, clk_[tid_]);
  clk_[tid_] = v;
}

ALWAYS_INLINE void ThreadClock::tick() {
  clk_[tid_]++;
}

ALWAYS_INLINE uptr ThreadClock::size() const {
  return nclk_;
}

ALWAYS_INLINE void ThreadClock::NoteGlobalAcquire(u64 v) {
  // Here we rely on the fact that AcquireGlobal is protected by
  // ThreadRegistryLock, thus only one thread at a time executes it
  // and values passed to this function should not go backwards.
  CHECK_LE(atomic_load_relaxed(&global_acquire_), v);
  atomic_store_relaxed(&global_acquire_, v);
}

ALWAYS_INLINE SyncClock::Iter SyncClock::begin() {
  return Iter(this);
}

ALWAYS_INLINE SyncClock::Iter SyncClock::end() {
  return Iter(nullptr);
}

ALWAYS_INLINE uptr SyncClock::size() const {
  return size_;
}

ALWAYS_INLINE SyncClock::Iter::Iter(SyncClock* parent)
    : parent_(parent)
    , pos_(nullptr)
    , end_(nullptr)
    , block_(-1) {
  if (parent)
    Next();
}

ALWAYS_INLINE SyncClock::Iter& SyncClock::Iter::operator++() {
  pos_++;
  if (UNLIKELY(pos_ >= end_))
    Next();
  return *this;
}

ALWAYS_INLINE bool SyncClock::Iter::operator!=(const SyncClock::Iter& other) {
  return parent_ != other.parent_;
}

ALWAYS_INLINE ClockElem &SyncClock::Iter::operator*() {
  return *pos_;
}

}  // namespace __tsan

#endif  // TSAN_CLOCK_H
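
// Usage sketch (illustrative only): begin()/end() let callers walk the
// table-resident elements of a SyncClock with a range-for. Note that dirty
// entries are not visited and must be handled separately:
//
//   for (ClockElem &ce : sync_clock)   // iterates only the table
//     max_epoch = max(max_epoch, ce.epoch);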