1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
base / i18n / string_search.cc [blame]
// Copyright 2011 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/i18n/string_search.h"
#include <stdint.h>
#include <string>
#include <string_view>
#include <utility>
#include "base/check.h"
#include "base/check_op.h"
#include "third_party/icu/source/i18n/unicode/usearch.h"
namespace base {
namespace i18n {
// Builds a reusable ICU searcher for the pattern `find_this`.
//
// The pattern is moved into `find_this_` and the ICU handle is opened over
// that member's buffer. `case_sensitive` selects the collation strength used
// for matching. If ICU reports a failure while opening the searcher, the
// collator is left unconfigured.
FixedPatternStringSearch::FixedPatternStringSearch(std::u16string find_this,
                                                   bool case_sensitive)
    : find_this_(std::move(find_this)) {
  const auto* const pattern = find_this_.data();
  const auto pattern_length = find_this_.size();

  // `usearch_open()` insists on a valid text to search even though the real
  // text is installed later through `usearch_setText()`, so the pattern is
  // passed a second time as a stand-in for the text.
  UErrorCode icu_status = U_ZERO_ERROR;
  search_ = usearch_open(pattern, pattern_length, pattern, pattern_length,
                         uloc_getDefault(), /*breakiter=*/nullptr,
                         &icu_status);
  if (U_FAILURE(icu_status)) {
    return;
  }

  // Collation strength decides which differences count as a mismatch
  // (http://icu-project.org/apiref/icu4c40/ucol_8h.html#6a967f36248b0a1bc7654f538ee8ba96):
  //   * UCOL_PRIMARY  - only base letters are compared; secondary and
  //                     tertiary differences are ignored.
  //   * UCOL_TERTIARY - all comparison differences are included.
  // Diacritics on the same base letter are a secondary difference; upper- vs
  // lowercase of the same character is a tertiary difference.
  const auto strength = case_sensitive ? UCOL_TERTIARY : UCOL_PRIMARY;
  ucol_setStrength(usearch_getCollator(search_), strength);

  // Reset the searcher after touching its collator (presumably required for
  // the new strength to take effect - see the ICU usearch documentation).
  usearch_reset(search_);
}
// Releases the ICU searcher, if one was successfully opened.
FixedPatternStringSearch::~FixedPatternStringSearch() {
  if (!search_) {
    return;
  }
  // Hand the handle to ICU for destruction. `ExtractAsDangling()` is used so
  // that the member is not treated as still referring to the freed object
  // (NOTE(review): semantics of `search_`'s pointer type are declared in the
  // header - confirm there).
  usearch_close(search_.ExtractAsDangling());
}
bool FixedPatternStringSearch::Search(std::u16string_view in_this,
size_t* match_index,
size_t* match_length,
bool forward_search) {
UErrorCode status = U_ZERO_ERROR;
usearch_setText(search_, in_this.data(), in_this.size(), &status);
// Default to basic substring search if usearch fails. According to
// http://icu-project.org/apiref/icu4c/usearch_8h.html, usearch_open will fail
// if either |find_this| or |in_this| are empty. In either case basic
// substring search will give the correct return value.
if (!U_SUCCESS(status)) {
size_t index = in_this.find(find_this_);
if (index == std::u16string::npos)
return false;
if (match_index)
*match_index = index;
if (match_length)
*match_length = find_this_.size();
return true;
}
int32_t index = forward_search ? usearch_first(search_, &status)
: usearch_last(search_, &status);
if (!U_SUCCESS(status) || index == USEARCH_DONE)
return false;
if (match_index)
*match_index = static_cast<size_t>(index);
if (match_length)
*match_length = static_cast<size_t>(usearch_getMatchedLength(search_));
return true;
}
FixedPatternStringSearchIgnoringCaseAndAccents::
FixedPatternStringSearchIgnoringCaseAndAccents(std::u16string find_this)
: base_search_(std::move(find_this), /*case_sensitive=*/false) {}
// Forward-searches `in_this` using the wrapped searcher.
//
// The arguments and the return value are passed through unchanged to and from
// `base_search_.Search()`; the search direction is always forward.
bool FixedPatternStringSearchIgnoringCaseAndAccents::Search(
    std::u16string_view in_this,
    size_t* match_index,
    size_t* match_length) {
  constexpr bool kForwardSearch = true;
  const bool found =
      base_search_.Search(in_this, match_index, match_length, kForwardSearch);
  return found;
}
bool StringSearchIgnoringCaseAndAccents(std::u16string find_this,
std::u16string_view in_this,
size_t* match_index,
size_t* match_length) {
return FixedPatternStringSearchIgnoringCaseAndAccents(std::move(find_this))
.Search(in_this, match_index, match_length);
}
bool StringSearch(std::u16string find_this,
std::u16string_view in_this,
size_t* match_index,
size_t* match_length,
bool case_sensitive,
bool forward_search) {
return FixedPatternStringSearch(std::move(find_this), case_sensitive)
.Search(in_this, match_index, match_length, forward_search);
}
// Builds an ICU searcher that looks for `find_this` inside `in_this`.
//
// Both strings are moved into members (`find_this_`, `in_this_`) and the ICU
// handle is opened over those members' buffers. `case_sensitive` selects the
// collation strength used for matching. Opening the searcher is expected to
// succeed (DCHECKed); if it nevertheless fails, the collator is left
// unconfigured.
RepeatingStringSearch::RepeatingStringSearch(std::u16string find_this,
                                             std::u16string in_this,
                                             bool case_sensitive)
    : find_this_(std::move(find_this)), in_this_(std::move(in_this)) {
  UErrorCode status = U_ZERO_ERROR;
  // The default locale name is passed straight through to ICU; it is only
  // needed for the duration of this call, so no owning copy is made.
  search_ = usearch_open(find_this_.data(), find_this_.size(), in_this_.data(),
                         in_this_.size(), uloc_getDefault(),
                         /*breakiter=*/nullptr, &status);
  DCHECK(U_SUCCESS(status));
  if (U_FAILURE(status)) {
    return;
  }

  // http://icu-project.org/apiref/icu4c40/ucol_8h.html#6a967f36248b0a1bc7654f538ee8ba96
  // Set comparison level to UCOL_PRIMARY to ignore secondary and tertiary
  // differences. Set comparison level to UCOL_TERTIARY to include all
  // comparison differences.
  // Diacritical differences on the same base letter represent a secondary
  // difference. Uppercase and lowercase versions of the same character
  // represent a tertiary difference.
  UCollator* collator = usearch_getCollator(search_);
  ucol_setStrength(collator, case_sensitive ? UCOL_TERTIARY : UCOL_PRIMARY);
  usearch_reset(search_);
}
// Releases the ICU searcher, if one was successfully opened.
RepeatingStringSearch::~RepeatingStringSearch() {
  if (!search_) {
    return;
  }
  // Hand the handle to ICU for destruction. `ExtractAsDangling()` is used so
  // that the member is not treated as still referring to the freed object
  // (NOTE(review): semantics of `search_`'s pointer type are declared in the
  // header - confirm there).
  usearch_close(search_.ExtractAsDangling());
}
bool RepeatingStringSearch::NextMatchResult(int& match_index,
int& match_length) {
UErrorCode status = U_ZERO_ERROR;
const int match_start = usearch_next(search_, &status);
if (U_FAILURE(status) || match_start == USEARCH_DONE)
return false;
DCHECK(U_SUCCESS(status));
match_index = match_start;
match_length = usearch_getMatchedLength(search_);
return true;
}
} // namespace i18n
} // namespace base