1
    2
    3
    4
    5
    6
    7
    8
    9
   10
   11
   12
   13
   14
   15
   16
   17
   18
   19
   20
   21
   22
   23
   24
   25
   26
   27
   28
   29
   30
   31
   32
   33
   34
   35
   36
   37
   38
   39
   40
   41
   42
   43
   44
   45
   46
   47
   48
   49
   50
   51
   52
   53
   54
   55
   56
   57
   58
   59
   60
   61
   62
   63
   64
   65
   66
   67
   68
   69
   70
   71
   72
   73
   74
   75
   76
   77
   78
   79
   80
   81
   82
   83
   84
   85
   86
   87
   88
   89
   90
   91
   92
   93
   94
   95
   96
   97
   98
   99
  100
  101
  102
  103
  104
  105
  106
  107
  108
  109
  110
  111
  112
  113
  114
  115
  116
  117
  118
  119
  120
  121
  122
  123
  124
  125
  126
  127
  128
  129
  130
  131
  132
  133
  134
  135
  136
  137
  138
  139
  140
  141
  142
  143
  144
  145
  146
  147
  148
  149
  150
  151
  152
  153
  154
  155
  156
  157
  158
  159
  160
  161
  162
  163
  164
  165
  166
  167
  168
  169
  170
  171
  172
  173
  174
  175
  176
  177
  178
  179
  180
  181
  182
  183
  184
  185
  186
  187
  188
  189
  190
  191
  192
  193
  194
  195
  196
  197
  198
  199
  200
  201
  202
  203
  204
  205
  206
  207
  208
  209
  210
  211
  212
  213
  214
  215
  216
  217
  218
  219
  220
  221
  222
  223
  224
  225
  226
  227
  228
  229
  230
  231
  232
  233
  234
  235
  236
  237
  238
  239
  240
  241
  242
  243
  244
  245
  246
  247
  248
  249
  250
  251
  252
  253
  254
  255
  256
  257
  258
  259
  260
  261
  262
  263
  264
  265
  266
  267
  268
  269
  270
  271
  272
  273
  274
  275
  276
  277
  278
  279
  280
  281
  282
  283
  284
  285
  286
  287
  288
  289
  290
  291
  292
  293
  294
  295
  296
  297
  298
  299
  300
  301
  302
  303
  304
  305
  306
  307
  308
  309
  310
  311
  312
  313
  314
  315
  316
  317
  318
  319
  320
  321
  322
  323
  324
  325
  326
  327
  328
  329
  330
  331
  332
  333
  334
  335
  336
  337
  338
  339
  340
  341
  342
  343
  344
  345
  346
  347
  348
  349
  350
  351
  352
  353
  354
  355
  356
  357
  358
  359
  360
  361
  362
  363
  364
  365
  366
  367
  368
  369
  370
  371
  372
  373
  374
  375
  376
  377
  378
  379
  380
  381
  382
  383
  384
  385
  386
  387
  388
  389
  390
  391
  392
  393
  394
  395
  396
  397
  398
  399
  400
  401
  402
  403
  404
  405
  406
  407
  408
  409
  410
  411
  412
  413
  414
  415
  416
  417
  418
  419
  420
  421
  422
  423
  424
  425
  426
  427
  428
  429

base / allocator / partition_allocator / src / partition_alloc / page_allocator.h [blame]

// Copyright 2013 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef PARTITION_ALLOC_PAGE_ALLOCATOR_H_
#define PARTITION_ALLOC_PAGE_ALLOCATOR_H_

#include <cstddef>
#include <cstdint>

#include "partition_alloc/build_config.h"
#include "partition_alloc/buildflags.h"
#include "partition_alloc/page_allocator_constants.h"
#include "partition_alloc/partition_alloc_base/compiler_specific.h"
#include "partition_alloc/partition_alloc_base/component_export.h"
#include "partition_alloc/thread_isolation/thread_isolation.h"

namespace partition_alloc {

struct PageAccessibilityConfiguration {
  enum Permissions {
    kInaccessible,
    // This flag is valid only with AllocPages(), where in creates kInaccessible
    // pages that may later be re-mapped as executable, on platforms which
    // distinguish never-executable and maybe-executable pages.
    kInaccessibleWillJitLater,
    kRead,
    kReadWrite,
    // This flag is mapped to kReadWrite on systems that
    // don't support MTE.
    kReadWriteTagged,
    // This flag is mapped to kReadExecute on systems
    // that don't support Arm's BTI.
    kReadExecuteProtected,
    kReadExecute,
    // This flag is mapped to `kReadWriteExecute` on systems that do not support
    // Arm's BTI.
    kReadWriteExecuteProtected,
    // This flag is deprecated and will go away soon.
    // TODO(bbudge) Remove this as soon as V8 doesn't need RWX pages.
    kReadWriteExecute,
  };

#if PA_BUILDFLAG(ENABLE_THREAD_ISOLATION)
  constexpr explicit PageAccessibilityConfiguration(Permissions permissions)
      : permissions(permissions) {}
  constexpr PageAccessibilityConfiguration(
      Permissions permissions,
      ThreadIsolationOption thread_isolation)
      : permissions(permissions), thread_isolation(thread_isolation) {}
#else
  constexpr explicit PageAccessibilityConfiguration(Permissions permissions)
      : permissions(permissions) {}
#endif  // PA_BUILDFLAG(ENABLE_THREAD_ISOLATION)

  Permissions permissions;
#if PA_BUILDFLAG(ENABLE_THREAD_ISOLATION)
  // Tag the page with a Memory Protection Key. Use 0 for none.
  ThreadIsolationOption thread_isolation;
#endif  // PA_BUILDFLAG(ENABLE_THREAD_ISOLATION)
};

// Use for De/RecommitSystemPages API.
enum class PageAccessibilityDisposition {
  // Enforces permission update (Decommit will set to
  // PageAccessibilityConfiguration::kInaccessible;
  // Recommit will set to whatever was requested, other than
  // PageAccessibilityConfiguration::kInaccessible).
  kRequireUpdate,
  // Will not update permissions, if the platform supports that (POSIX & Fuchsia
  // only).
  kAllowKeepForPerf,
};

// Some platforms (including macOS and some Linux-based ones) support tagged
// memory regions, to help in debugging. On Android, these tags are used to name
// anonymous mappings.
//
// kChromium is the default value, used to distinguish general
// Chromium-originated allocations from other ones (e.g. from platform
// libraries).
enum class PageTag {
  kSimulation = 251,      // Memory simulator tool.
  kBlinkGC = 252,         // Blink GC pages.
  kPartitionAlloc = 253,  // PartitionAlloc, no matter the partition.
  kChromium = 254,        // Chromium page.
  kV8 = 255,              // V8 heap pages.

  kFirst = kSimulation,  // Minimum tag value.
  kLast = kV8            // Maximum tag value.
};

// See
// https://github.com/apple-oss-distributions/xnu/blob/5c2921b07a2480ab43ec66f5b9e41cb872bc554f/osfmk/mach/vm_statistics.h#L687
static_assert(static_cast<int>(PageTag::kLast) >= 240,
              "The first application-reserved tag on macOS is 240, see "
              "vm_statistics.h in XNU.");
static_assert(
    static_cast<int>(PageTag::kLast) < 256,
    "Tags are only 1 byte long on macOS, see vm_statistics.h in XNU.");

PA_COMPONENT_EXPORT(PARTITION_ALLOC)
uintptr_t NextAlignedWithOffset(uintptr_t ptr,
                                uintptr_t alignment,
                                uintptr_t requested_offset);

// Allocates one or more pages.
//
// The requested |address| is just a hint; the actual address returned may
// differ. The returned address will be aligned to |align_offset| modulo |align|
// bytes.
//
// |length|, |align| and |align_offset| are in bytes, and must be a multiple of
// |PageAllocationGranularity()|. |length| and |align| must be non-zero.
// |align_offset| must be less than |align|. |align| must be a power of two.
//
// If |address| is 0/nullptr, then a suitable and randomized address will be
// chosen automatically.
//
// |accessibility| controls the permission of the allocated pages.
// PageAccessibilityConfiguration::kInaccessible means uncommitted.
//
// |page_tag| is used on some platforms to identify the source of the
// allocation.
//
// |file_descriptor_for_shared_alloc| is only used in mapping the shadow
// pools to the same physical address as the real one in
// PartitionAddressSpace::Init(). It should be ignored in other cases.
//
// This call will return 0/nullptr if the allocation cannot be satisfied.
PA_COMPONENT_EXPORT(PARTITION_ALLOC)
uintptr_t AllocPages(size_t length,
                     size_t align,
                     PageAccessibilityConfiguration accessibility,
                     PageTag page_tag = PageTag::kChromium,
                     int file_descriptor_for_shared_alloc = -1);
PA_COMPONENT_EXPORT(PARTITION_ALLOC)
uintptr_t AllocPages(uintptr_t address,
                     size_t length,
                     size_t align,
                     PageAccessibilityConfiguration accessibility,
                     PageTag page_tag = PageTag::kChromium);
PA_COMPONENT_EXPORT(PARTITION_ALLOC)
void* AllocPages(void* address,
                 size_t length,
                 size_t align,
                 PageAccessibilityConfiguration accessibility,
                 PageTag page_tag = PageTag::kChromium);
PA_COMPONENT_EXPORT(PARTITION_ALLOC)
uintptr_t AllocPagesWithAlignOffset(
    uintptr_t address,
    size_t length,
    size_t align,
    size_t align_offset,
    PageAccessibilityConfiguration page_accessibility,
    PageTag page_tag = PageTag::kChromium,
    int file_descriptor_for_shared_alloc = -1);

// Frees one or more pages starting at |address| and continuing for |length|
// bytes.
//
// |address| and |length| must match a previous call to |AllocPages|. Therefore,
// |address| must be aligned to |PageAllocationGranularity()| bytes, and
// |length| must be a multiple of |PageAllocationGranularity()|.
PA_COMPONENT_EXPORT(PARTITION_ALLOC)
void FreePages(uintptr_t address, size_t length);
PA_COMPONENT_EXPORT(PARTITION_ALLOC)
void FreePages(void* address, size_t length);

// Marks one or more system pages, starting at |address| with the given
// |page_accessibility|. |length| must be a multiple of |SystemPageSize()|
// bytes.
//
// Returns true if the permission change succeeded. In most cases you must
// |CHECK| the result.
//
// Note: On Windows, setting permissions to `PAGE_NOACCESS` will also decommit
// pages. This is desirable because clients assume that pages with no access
// rights should be "free" from a resource standpoint. In particular this allows
// clients to map a large amount of memory, set its access rights to
// `PAGE_NOACCESS` and not worry about commit limit exhaustion.
// On the flip side, this means that changing permissions can often fail on this
// platform.
[[nodiscard]] PA_COMPONENT_EXPORT(PARTITION_ALLOC) bool TrySetSystemPagesAccess(
    uintptr_t address,
    size_t length,
    PageAccessibilityConfiguration page_accessibility);
[[nodiscard]] PA_COMPONENT_EXPORT(PARTITION_ALLOC) bool TrySetSystemPagesAccess(
    void* address,
    size_t length,
    PageAccessibilityConfiguration page_accessibility);

// Marks one or more system pages, starting at |address| with the given
// |page_accessibility|. |length| must be a multiple of |SystemPageSize()|
// bytes.
//
// Performs a CHECK that the operation succeeds.
//
// See the note above for Windows-specific behavior.
PA_COMPONENT_EXPORT(PARTITION_ALLOC)
void SetSystemPagesAccess(uintptr_t address,
                          size_t length,
                          PageAccessibilityConfiguration page_accessibility);
PA_COMPONENT_EXPORT(PARTITION_ALLOC)
void SetSystemPagesAccess(void* address,
                          size_t length,
                          PageAccessibilityConfiguration page_accessibility);

// Decommits one or more system pages starting at |address| and continuing for
// |length| bytes. |address| and |length| must be aligned to a system page
// boundary.
//
// This API will crash if the operation cannot be performed!
//
// If disposition is PageAccessibilityDisposition::kRequireUpdate (recommended),
// the decommitted pages will be made inaccessible before the call returns.
// While it is always a programming error to access decommitted pages without
// first recommitting them, callers may use
// PageAccessibilityDisposition::kAllowKeepForPerf to allow the implementation
// to skip changing permissions (use with care), for performance reasons (see
// crrev.com/c/2567282 and crrev.com/c/2563038 for perf regressions encountered
// in the past). Implementations may choose to always modify permissions, hence
// accessing those pages may or may not trigger a fault.
//
// Decommitting means that physical resources (RAM or swap/pagefile) backing the
// allocated virtual address range may be released back to the system, but the
// address space is still allocated to the process (possibly using up page table
// entries or other accounting resources). There is no guarantee that the pages
// are zeroed, unless |DecommittedMemoryIsAlwaysZeroed()| is true.
//
// This operation may not be atomic on some platforms.
//
// Note: "Committed memory" is a Windows Memory Subsystem concept that ensures
// processes will not fault when touching a committed memory region. There is
// no analogue in the POSIX & Fuchsia memory API where virtual memory pages are
// best-effort allocated resources on the first touch. If
// PageAccessibilityDisposition::kRequireUpdate disposition is used, this API
// behaves in a platform-agnostic way by simulating the Windows "decommit" state
// by both discarding the region (allowing the OS to avoid swap operations)
// *and* changing the page protections so accesses fault.
PA_COMPONENT_EXPORT(PARTITION_ALLOC)
void DecommitSystemPages(
    uintptr_t address,
    size_t length,
    PageAccessibilityDisposition accessibility_disposition);
PA_COMPONENT_EXPORT(PARTITION_ALLOC)
void DecommitSystemPages(
    void* address,
    size_t length,
    PageAccessibilityDisposition accessibility_disposition);

// Decommits one or more system pages starting at |address| and continuing for
// |length| bytes. |address| and |length| must be aligned to a system page
// boundary.
//
// In contrast to |DecommitSystemPages|, this API guarantees that the pages are
// zeroed and will always mark the region as inaccessible (the equivalent of
// setting them to PageAccessibilityConfiguration::kInaccessible).
PA_COMPONENT_EXPORT(PARTITION_ALLOC)
bool DecommitAndZeroSystemPages(uintptr_t address,
                                size_t length,
                                PageTag page_tag = PageTag::kChromium);
PA_COMPONENT_EXPORT(PARTITION_ALLOC)
bool DecommitAndZeroSystemPages(void* address,
                                size_t length,
                                PageTag page_tag = PageTag::kChromium);

// Whether decommitted memory is guaranteed to be zeroed when it is
// recommitted. Do not assume that this will not change over time.
constexpr PA_COMPONENT_EXPORT(
    PARTITION_ALLOC) bool DecommittedMemoryIsAlwaysZeroed() {
#if PA_BUILDFLAG(IS_APPLE)
  return false;
#else
  return true;
#endif
}

// (Re)Commits one or more system pages, starting at |address| and continuing
// for |length| bytes with the given |page_accessibility| (must not be
// PageAccessibilityConfiguration::kInaccessible). |address| and |length|
// must be aligned to a system page boundary.
//
// This API will crash if the operation cannot be performed!
//
// If disposition is PageAccessibilityConfiguration::kRequireUpdate, the calls
// updates the pages to |page_accessibility|. This can be used regardless of
// what disposition was used to decommit the pages.
// PageAccessibilityConfiguration::kAllowKeepForPerf allows the implementation
// to leave the page permissions, if that improves performance. This option can
// only be used if the pages were previously accessible and decommitted with
// that same option.
//
// The memory will be zeroed when it is committed for the first time. However,
// there is no such guarantee when memory is recommitted, unless
// |DecommittedMemoryIsAlwaysZeroed()| is true.
//
// This operation may not be atomic on some platforms.
PA_COMPONENT_EXPORT(PARTITION_ALLOC)
void RecommitSystemPages(
    uintptr_t address,
    size_t length,
    PageAccessibilityConfiguration page_accessibility,
    PageAccessibilityDisposition accessibility_disposition);

// Like RecommitSystemPages(), but returns false instead of crashing.
[[nodiscard]] PA_COMPONENT_EXPORT(PARTITION_ALLOC) bool TryRecommitSystemPages(
    uintptr_t address,
    size_t length,
    PageAccessibilityConfiguration page_accessibility,
    PageAccessibilityDisposition accessibility_disposition);

// Discard one or more system pages starting at |address| and continuing for
// |length| bytes. |length| must be a multiple of |SystemPageSize()|.
//
// Discarding is a hint to the system that the page is no longer required. The
// hint may:
//   - Do nothing.
//   - Discard the page immediately, freeing up physical pages.
//   - Discard the page at some time in the future in response to memory
//   pressure.
//
// Only committed pages should be discarded. Discarding a page does not decommit
// it, and it is valid to discard an already-discarded page. A read or write to
// a discarded page will not fault.
//
// Reading from a discarded page may return the original page content, or a page
// full of zeroes.
//
// Writing to a discarded page is the only guaranteed way to tell the system
// that the page is required again. Once written to, the content of the page is
// guaranteed stable once more. After being written to, the page content may be
// based on the original page content, or a page of zeroes.
//
// WARNING: Do not discard a large amount of pages, for a potentially long
// duration. Discarded pages are *not* decommitted on Windows, where total
// system-wide committed memory is limited. As most Chromium OOM crashes are
// commit limit related, this will both impact Private Memory Footprint (which
// reports committed memory) and stability (since we will bump into the limit
// more often).
PA_COMPONENT_EXPORT(PARTITION_ALLOC)
void DiscardSystemPages(uintptr_t address, size_t length);
PA_COMPONENT_EXPORT(PARTITION_ALLOC)
void DiscardSystemPages(void* address, size_t length);

// Seal a number of system pages starting at |address|. Returns |true| on
// success.
//
// This blocks various modifications to the pages such as unmapping, remapping
// or changing page permissions. Note that it doesn't change the accessibility
// of the memory, sealed writable pages will still be writable.
//
// This is mainly useful for non-writable memory (either via page permissions or
// other hardware features like pkeys) that is bound to the process lifetime.
//
// While unmapping the pages gets blocked, it can still be possible to release
// the memory using |DiscardSystemPages()|, though note that at least on Linux,
// it requires write access to the page to succeed.
PA_COMPONENT_EXPORT(PARTITION_ALLOC)
bool SealSystemPages(uintptr_t address, size_t length);
PA_COMPONENT_EXPORT(PARTITION_ALLOC)
bool SealSystemPages(void* address, size_t length);

// Rounds up |address| to the next multiple of |SystemPageSize()|. Returns
// 0 for an |address| of 0.
PA_ALWAYS_INLINE PAGE_ALLOCATOR_CONSTANTS_DECLARE_CONSTEXPR uintptr_t
RoundUpToSystemPage(uintptr_t address) {
  return (address + internal::SystemPageOffsetMask()) &
         internal::SystemPageBaseMask();
}

// Rounds down |address| to the previous multiple of |SystemPageSize()|. Returns
// 0 for an |address| of 0.
PA_ALWAYS_INLINE PAGE_ALLOCATOR_CONSTANTS_DECLARE_CONSTEXPR uintptr_t
RoundDownToSystemPage(uintptr_t address) {
  return address & internal::SystemPageBaseMask();
}

// Rounds up |address| to the next multiple of |PageAllocationGranularity()|.
// Returns 0 for an |address| of 0.
PA_ALWAYS_INLINE PAGE_ALLOCATOR_CONSTANTS_DECLARE_CONSTEXPR uintptr_t
RoundUpToPageAllocationGranularity(uintptr_t address) {
  return (address + internal::PageAllocationGranularityOffsetMask()) &
         internal::PageAllocationGranularityBaseMask();
}

// Rounds down |address| to the previous multiple of
// |PageAllocationGranularity()|. Returns 0 for an |address| of 0.
PA_ALWAYS_INLINE PAGE_ALLOCATOR_CONSTANTS_DECLARE_CONSTEXPR uintptr_t
RoundDownToPageAllocationGranularity(uintptr_t address) {
  return address & internal::PageAllocationGranularityBaseMask();
}

// Reserves (at least) |size| bytes of address space, aligned to
// |PageAllocationGranularity()|. This can be called early on to make it more
// likely that large allocations will succeed. Returns true if the reservation
// succeeded, false if the reservation failed or a reservation was already made.
PA_COMPONENT_EXPORT(PARTITION_ALLOC) bool ReserveAddressSpace(size_t size);

// Releases any reserved address space. |AllocPages| calls this automatically on
// an allocation failure. External allocators may also call this on failure.
//
// Returns true when an existing reservation was released.
PA_COMPONENT_EXPORT(PARTITION_ALLOC) bool ReleaseReservation();

// Returns true if there is currently an address space reservation.
PA_COMPONENT_EXPORT(PARTITION_ALLOC) bool HasReservationForTesting();

// Returns |errno| (POSIX) or the result of |GetLastError| (Windows) when |mmap|
// (POSIX) or |VirtualAlloc| (Windows) fails.
PA_COMPONENT_EXPORT(PARTITION_ALLOC) uint32_t GetAllocPageErrorCode();

// Returns the total amount of mapped pages from all clients of
// PageAllocator. These pages may or may not be committed. This is mostly useful
// to assess address space pressure.
PA_COMPONENT_EXPORT(PARTITION_ALLOC) size_t GetTotalMappedSize();

#if PA_BUILDFLAG(IS_WIN)
// Sets whether to retry the allocation of pages when a commit failure
// happens. This doesn't cover cases where the system is out of address space,
// or reaches another limit.
PA_COMPONENT_EXPORT(PARTITION_ALLOC)
void SetRetryOnCommitFailure(bool retry_on_commit_failure);
bool GetRetryOnCommitFailure();
#endif  // PA_BUILDFLAG(IS_WIN)

}  // namespace partition_alloc

#endif  // PARTITION_ALLOC_PAGE_ALLOCATOR_H_