1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
// base/i18n/break_iterator.h
// Copyright 2011 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BASE_I18N_BREAK_ITERATOR_H_
#define BASE_I18N_BREAK_ITERATOR_H_
#include <stddef.h>
#include <memory>
#include <string>
#include <string_view>
#include "base/i18n/base_i18n_export.h"
#include "base/memory/raw_ptr.h"
// The BreakIterator class iterates through the words, word breaks, and
// line breaks in a UTF-16 string.
//
// It provides several modes, BREAK_WORD, BREAK_LINE, BREAK_NEWLINE, and
// BREAK_SENTENCE which modify how characters are aggregated into the returned
// string.
//
// Under BREAK_WORD mode, once a word is encountered any non-word
// characters are not included in the returned string (e.g. in the
// UTF-16 equivalent of the string " foo bar! ", the word breaks are at
// the periods in ". .foo. .bar.!. .").
// Note that Chinese/Japanese/Thai do not use spaces between words so that
// boundaries can fall in the middle of a continuous run of non-space /
// non-punctuation characters.
//
// Under BREAK_LINE mode, once a line breaking opportunity is encountered,
// any non-word characters are included in the returned string, breaking
// only when a space-equivalent character or a line breaking opportunity
// is encountered (e.g. in the UTF-16 equivalent of the string " foo bar! ",
// the breaks are at the periods in ". .foo .bar! .").
//
// Note that lines can be broken at any character/syllable/grapheme cluster
// boundary in Chinese/Japanese/Korean and at word boundaries in Thai
// (Thai does not use spaces between words). Therefore, this is NOT the same
// as breaking only at space-equivalent characters, as its former
// name (BREAK_SPACE) implied.
//
// Under BREAK_NEWLINE mode, all characters are included in the returned
// string, breaking only when a newline-equivalent character is encountered
// (e.g. in the UTF-16 equivalent of the string "foo\nbar!\n\n", the line
// breaks are at the periods in ".foo\n.bar!\n.\n.").
//
// Under BREAK_SENTENCE mode, all characters are included in the returned
// string, breaking only on sentence boundaries defined in "Unicode Standard
// Annex #29: Text Segmentation." Whitespace immediately following the sentence
// is also included. For example, in the UTF-16 equivalent of the string
// "foo bar! baz qux?" the breaks are at the periods in ".foo bar! .baz qux?."
//
// To extract the words from a string, move a BREAK_WORD BreakIterator
// through the string and test whether IsWord() is true. E.g.,
//   BreakIterator iter(str, BreakIterator::BREAK_WORD);
//   if (!iter.Init())
//     return false;
//   while (iter.Advance()) {
//     if (iter.IsWord()) {
//       // Region [iter.prev(), iter.pos()) contains a word.
//       VLOG(1) << "word: " << iter.GetString();
//     }
//   }
// ICU iterator type. It is forward declared to avoid transitively including
// the full ICU headers in every dependent file.
struct UBreakIterator;
namespace base {
namespace i18n {
// Deleter functor used as the second template argument of the
// std::unique_ptr that owns a UBreakIterator. Only the call operator is
// declared here; its definition is not part of this header.
struct UBreakIteratorDeleter {
// Invoked by std::unique_ptr with the owned UBreakIterator pointer.
void operator()(UBreakIterator*);
};
// Owning pointer to a UBreakIterator. The pointee is released through
// UBreakIteratorDeleter rather than the default `delete`.
using UBreakIteratorPtr =
std::unique_ptr<UBreakIterator, UBreakIteratorDeleter>;
// Iterates over the breaks (word, line, newline, character, sentence or
// custom rule-based) of a UTF-16 string. The iterator does not own the text:
// it only keeps a std::u16string_view onto it. Not copyable.
class BASE_I18N_EXPORT BreakIterator {
public:
// Selects how the text is segmented.
enum BreakType {
BREAK_WORD,
BREAK_LINE,
// TODO(jshin): Remove this after reviewing call sites.
// If call sites really need to break only on space-like characters,
// implement that separately.
// Note: this is an alias with the same value as BREAK_LINE.
BREAK_SPACE = BREAK_LINE,
BREAK_NEWLINE,
BREAK_CHARACTER,
// But don't remove this one!
RULE_BASED,
BREAK_SENTENCE,
};
// Classification of the break the iterator most recently stopped at; see
// GetWordBreakStatus().
enum WordBreakStatus {
// The end of text that the iterator recognizes as word characters.
// Non-word characters are things like punctuation and spaces.
IS_WORD_BREAK,
// Characters that the iterator can skip past, such as punctuation,
// whitespace, and, if using RULE_BASED mode, characters from another
// character set.
IS_SKIPPABLE_WORD,
// Only used if not in BREAK_WORD or RULE_BASED mode. This is returned for
// newlines, line breaks, and character breaks.
IS_LINE_OR_CHAR_BREAK
};
// Sentinel position: the largest value representable by size_t. pos()
// reports it when iteration is done, and prev() starts out with it.
static constexpr size_t npos = static_cast<size_t>(-1);
// Makes an iterator over |str| using the given |break_type|.
// Requires |str| to live as long as the BreakIterator does.
BreakIterator(std::u16string_view str, BreakType break_type);
// Makes a rule-based iterator over |str|. BreakType == RULE_BASED is implied.
// TODO(andrewhayden): This signature could easily be misinterpreted as
// "(const std::u16string& str, const std::u16string& locale)". We should do
// something better.
BreakIterator(std::u16string_view str, const std::u16string& rules);
// Copying is disabled.
BreakIterator(const BreakIterator&) = delete;
BreakIterator& operator=(const BreakIterator&) = delete;
~BreakIterator();
// Init() must be called before any of the iterators are valid.
// Returns false if ICU failed to initialize.
bool Init();
// Advances to the next break. Returns false if we've run past the end of
// the string. (Note that the very last "break" is after the final
// character in the string, and when we advance to that position it's the
// last time Advance() returns true.)
bool Advance();
// Updates the text used by the iterator, resetting the iterator as if
// Init() had been called again. Any old state is lost. Returns true
// unless there is an error setting the text.
bool SetText(std::u16string_view text);
// Under BREAK_WORD mode, returns true if the break we just hit is the
// end of a word. (Otherwise, the break iterator just skipped over e.g.
// whitespace or punctuation.) Under BREAK_LINE and BREAK_NEWLINE modes,
// this distinction doesn't apply and it always returns false.
bool IsWord() const;
// Under BREAK_WORD mode:
// - Returns IS_SKIPPABLE_WORD if non-word characters, such as punctuation or
// spaces, are found.
// - Returns IS_WORD_BREAK if the break we just hit is the end of a sequence
// of word characters.
// Under RULE_BASED mode:
// - Returns IS_SKIPPABLE_WORD if characters outside the rules' character set
// or non-word characters, such as punctuation or spaces, are found.
// - Returns IS_WORD_BREAK if the break we just hit is the end of a sequence
// of word characters that are in the rules' character set.
// Not under BREAK_WORD or RULE_BASED mode:
// - Returns IS_LINE_OR_CHAR_BREAK.
BreakIterator::WordBreakStatus GetWordBreakStatus() const;
// Under BREAK_WORD mode, IsEndOfWord() returns true if |position| is at the
// end of a word and IsStartOfWord() returns true if |position| is at the
// start of a word. Both always return false under modes that are not
// BREAK_WORD or RULE_BASED.
bool IsEndOfWord(size_t position) const;
bool IsStartOfWord(size_t position) const;
// Under BREAK_SENTENCE mode, returns true if |position| is at a sentence
// boundary. It always returns false under modes that are not BREAK_SENTENCE
// or RULE_BASED.
bool IsSentenceBoundary(size_t position) const;
// Under BREAK_CHARACTER mode, returns whether |position| is a Unicode
// grapheme boundary.
bool IsGraphemeBoundary(size_t position) const;
// Returns the string between prev() and pos().
// Advance() must have been called successfully at least once for pos() to
// have advanced to somewhere useful.
std::u16string_view GetString() const;
// Returns the value of pos() returned before Advance() was last called.
// Starts out as BreakIterator::npos.
size_t prev() const { return prev_; }
// Returns the current break position within the string,
// or BreakIterator::npos when done. Starts out as 0.
size_t pos() const { return pos_; }
private:
// Owning handle to the underlying ICU break iterator.
UBreakIteratorPtr iter_;
// The string we're iterating over. Can be changed with SetText(...).
// This is a non-owning view, so the caller must keep the text alive.
std::u16string_view string_;
// Rules for our iterator. Mutually exclusive with break_type_.
const std::u16string rules_;
// The breaking style (word/space/newline). Mutually exclusive with rules_.
const BreakType break_type_;
// Previous and current iterator positions.
size_t prev_ = npos;
size_t pos_ = 0;
};
} // namespace i18n
} // namespace base
#endif // BASE_I18N_BREAK_ITERATOR_H_