1
    2
    3
    4
    5
    6
    7
    8
    9
   10
   11
   12
   13
   14
   15
   16
   17
   18
   19
   20
   21
   22
   23
   24
   25
   26
   27
   28
   29
   30
   31
   32
   33
   34
   35
   36
   37
   38
   39
   40
   41
   42
   43
   44
   45
   46
   47
   48
   49
   50
   51
   52
   53
   54
   55
   56
   57
   58
   59
   60
   61
   62
   63
   64
   65
   66
   67
   68
   69
   70
   71
   72
   73
   74
   75
   76
   77
   78
   79
   80
   81
   82
   83
   84
   85
   86
   87
   88
   89
   90
   91
   92
   93
   94
   95
   96
   97
   98
   99
  100
  101
  102
  103
  104
  105
  106
  107
  108
  109
  110
  111
  112
  113
  114
  115
  116
  117
  118
  119
  120
  121
  122
  123
  124
  125
  126
  127
  128
  129
  130
  131
  132
  133
  134
  135
  136
  137
  138
  139
  140
  141
  142
  143
  144
  145
  146
  147
  148
  149
  150
  151
  152
  153
  154
  155
  156
  157
  158
  159
  160
  161
  162
  163
  164
  165
  166
  167
  168
  169
  170
  171
  172
  173
  174
  175
  176
  177
  178
  179
  180
  181
  182
  183
  184
  185
  186
  187
  188
  189
  190
  191
  192
  193
  194
  195
  196
  197
  198
  199
  200
  201
  202
  203
  204
  205
  206
  207
  208
  209
  210
  211
  212
  213
  214
  215
  216
  217
  218
  219
  220
  221
  222
  223
  224
  225
  226
  227
  228
  229
  230
  231
  232
  233
  234
  235
  236
  237
  238
  239
  240
  241
  242
  243
  244
  245
  246
  247
  248
  249
  250
  251
  252
  253
  254
  255
  256
  257
  258
  259
  260
  261
  262
  263
  264
  265
  266
  267
  268
  269
  270
  271
  272
  273
  274
  275
  276
  277
  278
  279
  280
  281
  282
  283
  284
  285
  286
  287
  288
  289
  290
  291
  292
  293
  294
  295
  296
  297
  298
  299
  300
  301
  302
  303
  304
  305
  306
  307
  308
  309
  310
  311
  312
  313
  314
  315
  316
  317
  318
  319
  320
  321
  322
  323
  324
  325
  326
  327
  328
  329
  330
  331
  332
  333
  334

gpu / ipc / service / gpu_watchdog_thread.h [blame]

// Copyright 2012 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef GPU_IPC_SERVICE_GPU_WATCHDOG_THREAD_H_
#define GPU_IPC_SERVICE_GPU_WATCHDOG_THREAD_H_

#include "base/atomicops.h"
#include "base/memory/raw_ptr.h"
#include "base/memory/raw_ptr_exclusion.h"
#include "base/memory/weak_ptr.h"
#include "base/power_monitor/power_observer.h"
#include "base/task/task_observer.h"
#include "base/threading/thread.h"
#include "base/time/time.h"
#include "build/build_config.h"
#include "build/chromecast_buildflags.h"
#include "gpu/ipc/common/gpu_watchdog_timeout.h"
#include "gpu/ipc/service/gpu_ipc_service_export.h"
#include "ui/gfx/native_widget_types.h"
#include "ui/gl/progress_reporter.h"

namespace gpu {

// These values are persisted to logs. Entries should not be renumbered and
// numeric values should never be reused.
enum class GpuWatchdogThreadEvent {
  kGpuWatchdogStart,
  kGpuWatchdogKill,
  kGpuWatchdogEnd,
  kMaxValue = kGpuWatchdogEnd,
};

// These values are persisted to logs. Entries should not be renumbered and
// numeric values should never be reused.
enum class GpuWatchdogTimeoutEvent {
  // Recorded each time OnWatchdogTimeout() is called.
  kTimeout = 0,
  // Recorded when a GPU main thread is killed for a detected hang.
  kKill = 1,
  // Window only: Recorded when a hang is detected but we allow the GPU main
  // thread to continue until it spent the full
  // thread time doing the work.
  kMoreThreadTime = 2,
  // Windows only: The GPU makes progress after givenmore thread time. The GPU
  // main thread is not killed.
  kProgressAfterMoreThreadTime = 3,
  // Deprecated. A gpu hang is detected but watchdog waits for 60 seconds before
  // taking action.
  // kTimeoutWait = 4,
  // Deprecated. The GPU makes progress within 60 sec in OnWatchdogTimeout().
  // The GPU main thread is not killed.
  // kProgressAfterWait = 5,
  // Just continue if it's not on the TTY of our host X11 server.
  kContinueOnNonHostServerTty = 6,
  // Windows only: After detecting GPU hang and continuing running through
  // OnGpuWatchdogTimeout for the max cycles, the GPU main thread still cannot
  // get the full thread time.
  kLessThanFullThreadTimeAfterCapped = 7,
  // Windows only: The GPU main thread went through the
  // kLessThanFullThreadTimeAfterCapped stage before the process is killed.
  kKillOnLessThreadTime = 8,
  // OnWatchdogTimeout() is called long after the expected time. The GPU is not
  // killed this time because of the slow system.
  kSlowWatchdogThread = 9,
  kNoKillForGpuProgressDuringCrashDumping = 10,
  kMaxValue = kNoKillForGpuProgressDuringCrashDumping,
};

#if BUILDFLAG(IS_WIN)
// If the actual time the watched GPU thread spent doing actual work is less
// than the watchdog timeout, the GPU thread can continue running through
// OnGPUWatchdogTimeout for at most 4 times before the gpu thread is killed.
constexpr int kMaxCountOfMoreGpuThreadTimeAllowed = 3;
#endif
constexpr int kMaxExtraCyclesBeforeKill = 0;

// If the scheduled timeout function is delayed by more than
// kUnreasonableTimeoutDelay, we assume the system is in a unexpected state and
// the GPU watchdog will NOT terminate the GPU process if no progress is made in
// the GPU main thread or in the GPU display compositor thread. This is used in
// determining SlowWatchdogThread.
constexpr base::TimeDelta kUnreasonableTimeoutDelay = base::Seconds(5);

// A thread that intermitently sends tasks to a group of watched message loops
// and deliberately crashes if one of them does not respond after a timeout.
class GPU_IPC_SERVICE_EXPORT GpuWatchdogThread
    : public base::Thread,
      public base::PowerSuspendObserver,
      public base::TaskObserver,
      public gl::ProgressReporter {
 public:
  static std::unique_ptr<GpuWatchdogThread> Create(
      bool start_backgrounded,
      bool software_rendering,
      const std::string& thread_name);

  // Use the existing GpuWatchdogThread to create a second one. This is used
  // for DrDC thread only.
  static std::unique_ptr<GpuWatchdogThread> Create(
      bool start_backgrounded,
      const GpuWatchdogThread* existing_watchdog,
      const std::string& thread_name);

  static std::unique_ptr<GpuWatchdogThread> Create(
      bool start_backgrounded,
      base::TimeDelta timeout,
      int restart_factor,
      bool test_mode,
      const std::string& thread_name);

  GpuWatchdogThread(const GpuWatchdogThread&) = delete;
  GpuWatchdogThread& operator=(const GpuWatchdogThread&) = delete;

  ~GpuWatchdogThread() override;

  // Notifies the watchdog when Chrome is backgrounded / foregrounded. Should
  // only be used if Chrome is completely backgrounded and not expected to
  // render (all windows backgrounded and not producing frames).
  void OnBackgrounded();
  void OnForegrounded();

  // The watchdog starts armed to catch startup hangs, and needs to be disarmed
  // once init is complete, before executing tasks.
  void OnInitComplete();

  // Notifies the watchdog when the GPU child process is being destroyed.
  // This function is called directly from
  // viz::GpuServiceImpl::~GpuServiceImpl()
  void OnGpuProcessTearDown();

  // Pause the GPU watchdog to stop the timeout task. If the current heavy task
  // is not running on the GPU driver, the watchdog can be paused to avoid
  // unneeded crash.
  void PauseWatchdog();
  // Continue the watchdog after a pause.
  void ResumeWatchdog();

  // For gpu testing only. Return status for the watchdog tests
  bool IsGpuHangDetectedForTesting();

  // Implements base::Thread.
  void Init() override;
  void CleanUp() override;

  // Implements gl::ProgressReporter.
  void ReportProgress() override;

  // Implements TaskObserver.
  void WillProcessTask(const base::PendingTask& pending_task,
                       bool was_blocked_or_low_priority) override;
  void DidProcessTask(const base::PendingTask& pending_task) override;

  // Implements base::PowerSuspendObserver.
  void OnSuspend() override;
  void OnResume() override;

 protected:
  GpuWatchdogThread();

 private:
  enum PauseResumeSource {
    kAndroidBackgroundForeground = 0,
    kPowerSuspendResume = 1,
    kGeneralGpuFlow = 2,
  };

  GpuWatchdogThread(base::TimeDelta timeout,
                    int restart_factor,
                    bool test_mode,
                    const std::string& thread_name);
  void AddPowerObserver();
  void RestartWatchdogTimeoutTask(PauseResumeSource source_of_request);
  void StopWatchdogTimeoutTask(PauseResumeSource source_of_request);
  void UpdateInitializationFlag();
  void Arm();
  void Disarm();
  void InProgress();
  bool IsArmed();
  base::subtle::Atomic32 ReadArmDisarmCounter();
  void OnWatchdogTimeout();
  bool SlowWatchdogThread();
  bool WatchedThreadNeedsMoreThreadTime(bool no_gpu_hang_detected);
#if BUILDFLAG(IS_WIN)
  base::ThreadTicks GetWatchedThreadTime();
#endif

  // Do not change the function name. It is used for [GPU HANG] crash reports.
  void DeliberatelyTerminateToRecoverFromHang();
  void ContinueWithNextWatchdogTimeoutTask();

  // Records "GPU.WatchdogThread.Event".
  void GpuWatchdogThreadEventHistogram(GpuWatchdogThreadEvent thread_event);

  // Histogram recorded in OnWatchdogTimeout()
  // Records "GPU.WatchdogThread.Timeout"
  void GpuWatchdogTimeoutHistogram(GpuWatchdogTimeoutEvent timeout_event);

#if BUILDFLAG(IS_WIN)
  // Histograms recorded for WatchedThreadNeedsMoreThreadTime() function.
  void WatchedThreadNeedsMoreThreadTimeHistogram(
      bool no_gpu_hang_detected,
      bool start_of_more_thread_time);
#endif

  // Used for metrics. It's 1 minute after the event.
  bool WithinOneMinFromPowerResumed();
  bool WithinOneMinFromForegrounded();

#if BUILDFLAG(IS_LINUX) && !BUILDFLAG(IS_CASTOS)
  void UpdateActiveTTY();
#endif
  // The watchdog continues when it's not on the TTY of our host X11 server.
  bool ContinueOnNonHostX11ServerTty();

  // This counter is only written on the gpu thread, and read on both threads.
  volatile base::subtle::Atomic32 arm_disarm_counter_ = 0;
  // The counter number read in the last OnWatchdogTimeout() on the watchdog
  // thread.
  int32_t last_arm_disarm_counter_ = 0;

  // Timeout on the watchdog thread to check if gpu hangs.
  base::TimeDelta watchdog_timeout_;

  // The one-time watchdog timeout multiplier after the watchdog pauses and
  // restarts.
  const int watchdog_restart_factor_;

  // The time the gpu watchdog was created.
  base::TimeTicks watchdog_start_timeticks_;

  // The time the last OnSuspend and OnResume was called.
  base::TimeTicks power_suspend_timeticks_;
  base::TimeTicks power_resume_timeticks_;

  // The time the last OnBackgrounded and OnForegrounded was called.
  base::TimeTicks backgrounded_timeticks_;
  base::TimeTicks foregrounded_timeticks_;

  // The time PauseWatchdog and ResumeWatchdog was called.
  base::TimeTicks watchdog_pause_timeticks_;
  base::TimeTicks watchdog_resume_timeticks_;

  // TimeTicks: Tracking the amount of time a task runs. Executing delayed
  //            tasks at the right time.
  // ThreadTicks: Use this timer to (approximately) measure how much time the
  // calling thread spent doing actual work vs. being de-scheduled.

  // The time the last OnWatchdogTimeout() was called.
  base::TimeTicks last_on_watchdog_timeout_timeticks_;

  // The wall-clock time the next OnWatchdogTimeout() will be called.
  base::Time next_on_watchdog_timeout_time_;

#if BUILDFLAG(IS_WIN)
  base::ThreadTicks last_on_watchdog_timeout_thread_ticks_;

  // The difference between the timeout and the actual time the watched thread
  // spent doing actual work.
  base::TimeDelta remaining_watched_thread_ticks_;

  // The Windows thread hanndle of the watched GPU main thread.
  // RAW_PTR_EXCLUSION: This field holds windows handles
  RAW_PTR_EXCLUSION void* watched_thread_handle_ = nullptr;

  // After GPU hang detected, how many times has the GPU thread been allowed to
  // continue due to not enough thread time.
  int count_of_more_gpu_thread_time_allowed_ = 0;

  // After detecting GPU hang and continuing running through
  // OnGpuWatchdogTimeout for the max cycles, the GPU main thread still cannot
  // get the full thread time.
  bool less_than_full_thread_time_after_capped_ = false;
#endif

#if BUILDFLAG(IS_LINUX) && !BUILDFLAG(IS_CASTOS)
  struct Deleter {
    inline void operator()(FILE* f) {
      if (f)
        fclose(f);
    }
  };
  std::unique_ptr<FILE, Deleter> tty_file_;
  int host_tty_ = -1;
  int active_tty_ = -1;
  int last_active_tty_ = -1;
#endif

  // The system has entered the power suspension mode.
  bool in_power_suspension_ = false;

  // The GPU process has started tearing down. Accessed only in the gpu process.
  bool in_gpu_process_teardown_ = false;

  // Chrome is running on the background on Android. Gpu is probably very slow
  // or stalled.
  bool is_backgrounded_ = false;

  // The GPU watchdog is paused. The timeout task is temporarily stopped.
  bool is_paused_ = false;

  // whether GpuWatchdogThreadEvent::kGpuWatchdogStart has been recorded.
  bool is_watchdog_start_histogram_recorded_ = false;

  // Read/Write by the watchdog thread only after initialized in the
  // constructor.
  bool in_gpu_initialization_ = false;

  // For the experiment and the debugging purpose
  size_t num_of_timeout_after_power_resume_ = 0;
  size_t num_of_timeout_after_foregrounded_ = 0;
  bool foregrounded_event_ = false;
  bool power_resumed_event_ = false;

  // The watched thread name string used for UMA and crash key.
  std::string watched_thread_name_str_uma_;

  // The thread id string of the watched thread.
  std::string watched_thread_id_str_;

  // For gpu testing only.
  const bool is_test_mode_;

  // Set by the watchdog thread and Read by the test thread.
  base::AtomicFlag test_result_timeout_and_gpu_hang_;

  SEQUENCE_CHECKER(watched_thread_sequence_checker_);

  base::WeakPtr<GpuWatchdogThread> weak_ptr_;
  base::WeakPtrFactory<GpuWatchdogThread> weak_factory_{this};
};

}  // namespace gpu
#endif  // GPU_IPC_SERVICE_GPU_WATCHDOG_THREAD_H_