-
Notifications
You must be signed in to change notification settings - Fork 16
Expand file tree
/
Copy pathsplit.go
More file actions
242 lines (225 loc) · 7.35 KB
/
split.go
File metadata and controls
242 lines (225 loc) · 7.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package strings
import (
"solod.dev/so/mem"
"solod.dev/so/unicode"
"solod.dev/so/unicode/utf8"
)
// asciiSpace is a byte-indexed lookup table: an entry is 1 for the six
// ASCII white space bytes listed below and 0 for every other byte value.
var asciiSpace = [256]uint8{
	' ':  1,
	'\t': 1,
	'\n': 1,
	'\v': 1,
	'\f': 1,
	'\r': 1,
}
// Split cuts s at every occurrence of sep and returns the pieces found
// between the separators; the separators themselves are not included.
//
// When sep is non-empty and does not occur in s, the result has exactly
// one element, s itself.
//
// When sep is empty, s is split after each UTF-8 sequence. When s and sep
// are both empty, the result is an empty slice.
//
// Split behaves like [SplitN] called with a count of -1.
//
// To split around only the first occurrence of a separator, see [Cut].
//
// If the allocator is nil, uses the system allocator.
// The returned slice is allocated; the caller owns it.
// Its elements are substrings that reference the original string s.
func Split(a mem.Allocator, s, sep string) []string {
	const (
		keepNone = 0  // bytes of sep retained at the end of each piece
		noLimit  = -1 // negative count: return every piece
	)
	return genSplit(a, s, sep, keepNone, noLimit)
}
// SplitN cuts s at occurrences of sep and returns the pieces found between
// the separators, limited by n:
//   - n > 0: at most n pieces; the final piece is the unsplit remainder;
//   - n == 0: the result is nil (no pieces);
//   - n < 0: every piece.
//
// Empty s and/or empty sep are treated as described for [Split].
//
// To split around only the first occurrence of a separator, see [Cut].
//
// If the allocator is nil, uses the system allocator.
// The returned slice is allocated; the caller owns it.
// Its elements are substrings that reference the original string s.
func SplitN(a mem.Allocator, s, sep string, n int) []string {
	// Keep zero bytes of the separator in each piece.
	const keepNone = 0
	return genSplit(a, s, sep, keepNone, n)
}
// SplitAfter cuts s immediately after every occurrence of sep and returns
// the resulting pieces; each piece that was terminated by a separator
// keeps that separator at its end.
//
// When sep is non-empty and does not occur in s, the result has exactly
// one element, s itself.
//
// When sep is empty, s is split after each UTF-8 sequence. When s and sep
// are both empty, the result is an empty slice.
//
// SplitAfter behaves like [SplitAfterN] called with a count of -1.
//
// If the allocator is nil, uses the system allocator.
// The returned slice is allocated; the caller owns it.
// Its elements are substrings that reference the original string s.
func SplitAfter(a mem.Allocator, s, sep string) []string {
	// Retain the whole separator at the end of each piece.
	keep := len(sep)
	return genSplit(a, s, sep, keep, -1)
}
// span describes the substring s[start:end] of some string s:
// start is the first byte included, end is the first byte excluded.
type span struct {
	start, end int
}
// Fields breaks s into the maximal runs of non-white-space characters,
// where white space is defined by [unicode.IsSpace]. Runs of one or more
// white space characters act as a single separator, and leading and
// trailing white space is dropped (unlike [Split]), so every returned
// element is non-empty. If s contains only white space, the result is an
// empty slice.
//
// If the allocator is nil, uses the system allocator.
// The returned slice is allocated; the caller owns it.
// Its elements are substrings that reference the original string s.
func Fields(a mem.Allocator, s string) []string {
	// Pass 1: count the fields and OR every byte together.
	// The count is exact for pure-ASCII input; for anything else it is
	// only an estimate and is discarded below.
	count := 0
	prevSpace := 1 // treat the position before s[0] as white space
	var seen uint8 // union of all bits set in any byte of s
	for k := 0; k < len(s); k++ {
		b := s[k]
		seen |= b
		curSpace := int(asciiSpace[b])
		// Adds 1 exactly on a space -> non-space transition.
		count += prevSpace & ^curSpace
		prevSpace = curSpace
	}

	// A set high bit means at least one byte is not ASCII: take the
	// general rune-aware path.
	if seen >= utf8.RuneSelf {
		return FieldsFunc(a, s, unicode.IsSpace)
	}

	// Pass 2 (ASCII only): alternate between skipping a run of spaces and
	// recording the run of non-spaces that follows it.
	out := mem.AllocSlice[string](a, count, count)
	filled := 0
	pos := 0
	for pos < len(s) {
		for pos < len(s) && asciiSpace[s[pos]] != 0 {
			pos++
		}
		if pos >= len(s) {
			// Only trailing white space was left.
			break
		}
		begin := pos
		for pos < len(s) && asciiSpace[s[pos]] == 0 {
			pos++
		}
		out[filled] = s[begin:pos]
		filled++
	}
	return out
}
// FieldsFunc breaks s into the maximal runs of code points c for which
// f(c) is false and returns those runs as substrings of s. Runs of code
// points satisfying f act as separators; leading and trailing separator
// runs are dropped (unlike [Split]), so every returned element is
// non-empty. If s is empty or every code point satisfies f, the result is
// an empty slice.
//
// FieldsFunc makes no guarantees about the order in which it calls f(c)
// and assumes that f always returns the same value for a given c.
//
// If the allocator is nil, uses the system allocator.
// The returned slice is allocated; the caller owns it.
// Its elements are substrings that reference the original string s.
func FieldsFunc(a mem.Allocator, s string, f RunePredicate) []string {
	// Field boundaries are collected first and the substrings are cut in a
	// second loop. This temporary list is created with the built-in make,
	// not through the allocator a.
	// NOTE(review): the list starts with capacity 32 and relies on append
	// to grow; confirm append's growth behaviour in this toolchain for
	// inputs with more than 32 fields.
	fields := make([]span, 0, 32)

	// begin >= 0 is the byte offset of the field currently being scanned;
	// any negative value means no field is open.
	begin := -1
	for pos, r := range s {
		isSep := f(r)
		if isSep && begin >= 0 {
			// A separator closes the open field.
			fields = append(fields, span{begin, pos})
			// Any negative value marks "no open field"; the complement is
			// reported to benchmark slightly faster than -1 on amd64.
			begin = ^begin
			continue
		}
		if !isSep && begin < 0 {
			// First non-separator after a gap opens a new field.
			begin = pos
		}
	}
	// A field still open at the end of s runs to EOF.
	if begin >= 0 {
		fields = append(fields, span{begin, len(s)})
	}

	// Cut the recorded ranges out of s.
	out := mem.AllocSlice[string](a, len(fields), len(fields))
	for k := 0; k < len(fields); k++ {
		out[k] = s[fields[k].start:fields[k].end]
	}
	return out
}
// genSplit is the shared implementation behind the Split family.
// It cuts s at occurrences of sep and keeps the first sepSave bytes of
// each matched separator at the end of the piece that precedes it.
//
// n limits the number of pieces: n == 0 yields nil, n < 0 means no limit,
// and n > 0 yields at most n pieces with the unsplit remainder last.
// An empty sep delegates to explode.
//
// If the allocator is nil, uses the system allocator.
// The returned slice is allocated; the caller owns it.
// Its elements are substrings that reference the original string s.
func genSplit(a mem.Allocator, s, sep string, sepSave, n int) []string {
	if n == 0 {
		return nil
	}
	if sep == "" {
		return explode(a, s, n)
	}

	// Work out how many slots to allocate.
	limit := n
	if limit < 0 {
		// Unlimited: one more piece than there are separators.
		limit = Count(s, sep) + 1
	}
	maxPieces := len(s) + 1
	if limit > maxPieces {
		limit = maxPieces
	}
	out := mem.AllocSlice[string](a, limit, limit)

	// Fill every slot except the last by cutting at successive separators;
	// the last slot always receives whatever is left.
	rest := s
	used := 0
	for last := limit - 1; used < last; used++ {
		at := Index(rest, sep)
		if at < 0 {
			break
		}
		out[used] = rest[:at+sepSave]
		rest = rest[at+len(sep):]
	}
	out[used] = rest

	// Fewer separators than slots leaves unused tail entries; trim them.
	return out[:used+1]
}
// explode cuts s into UTF-8 sequences, one per Unicode character, and
// returns at most n of them (n < 0 means no limit). When the limit is
// smaller than the number of characters, the final element holds the
// whole unsplit remainder of s.
// Invalid UTF-8 bytes are sliced individually.
//
// If the allocator is nil, uses the system allocator.
// The returned slice is allocated; the caller owns it.
// Its elements are substrings that reference the original string s.
func explode(a mem.Allocator, s string, n int) []string {
	total := utf8.RuneCountInString(s)
	if n < 0 || n > total {
		n = total
	}
	out := mem.AllocSlice[string](a, n, n)
	if n == 0 {
		return out
	}

	// Peel one character at a time off the front for all but the last
	// slot, then store what is left in the last slot.
	last := n - 1
	rest := s
	for k := 0; k < last; k++ {
		_, width := utf8.DecodeRuneInString(rest)
		out[k] = rest[:width]
		rest = rest[width:]
	}
	out[last] = rest
	return out
}