forked from toolshed/abra
chore: go mod tidy / vendor / make deps
This commit is contained in:
21
vendor/github.com/clipperhouse/uax29/v2/LICENSE
generated
vendored
Normal file
21
vendor/github.com/clipperhouse/uax29/v2/LICENSE
generated
vendored
Normal file
@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2020 Matt Sherman
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
82
vendor/github.com/clipperhouse/uax29/v2/graphemes/README.md
generated
vendored
Normal file
82
vendor/github.com/clipperhouse/uax29/v2/graphemes/README.md
generated
vendored
Normal file
@ -0,0 +1,82 @@
|
||||
An implementation of grapheme cluster boundaries from [Unicode text segmentation](https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries) (UAX 29), for Unicode version 15.0.0.
|
||||
|
||||
## Quick start
|
||||
|
||||
```
|
||||
go get "github.com/clipperhouse/uax29/v2/graphemes"
|
||||
```
|
||||
|
||||
```go
|
||||
import "github.com/clipperhouse/uax29/v2/graphemes"
|
||||
|
||||
text := "Hello, 世界. Nice dog! 👍🐶"
|
||||
|
||||
tokens := graphemes.FromString(text)
|
||||
|
||||
for tokens.Next() { // Next() returns true until end of data
|
||||
fmt.Println(tokens.Value()) // Do something with the current grapheme
|
||||
}
|
||||
```
|
||||
|
||||
[](https://pkg.go.dev/github.com/clipperhouse/uax29/v2/graphemes)
|
||||
|
||||
_A grapheme is a “single visible character”, which might be a simple as a single letter, or a complex emoji that consists of several Unicode code points._
|
||||
|
||||
## Conformance
|
||||
|
||||
We use the Unicode [test suite](https://unicode.org/reports/tr41/tr41-26.html#Tests29). Status:
|
||||
|
||||

|
||||
|
||||
## APIs
|
||||
|
||||
### If you have a `string`
|
||||
|
||||
```go
|
||||
text := "Hello, 世界. Nice dog! 👍🐶"
|
||||
|
||||
tokens := graphemes.FromString(text)
|
||||
|
||||
for tokens.Next() { // Next() returns true until end of data
|
||||
fmt.Println(tokens.Value()) // Do something with the current grapheme
|
||||
}
|
||||
```
|
||||
|
||||
### If you have an `io.Reader`
|
||||
|
||||
`FromReader` embeds a [`bufio.Scanner`](https://pkg.go.dev/bufio#Scanner), so just use those methods.
|
||||
|
||||
```go
|
||||
r := getYourReader() // from a file or network maybe
|
||||
tokens := graphemes.FromReader(r)
|
||||
|
||||
for tokens.Scan() { // Scan() returns true until error or EOF
|
||||
fmt.Println(tokens.Text()) // Do something with the current grapheme
|
||||
}
|
||||
|
||||
if tokens.Err() != nil { // Check the error
|
||||
log.Fatal(tokens.Err())
|
||||
}
|
||||
```
|
||||
|
||||
### If you have a `[]byte`
|
||||
|
||||
```go
|
||||
b := []byte("Hello, 世界. Nice dog! 👍🐶")
|
||||
|
||||
tokens := graphemes.FromBytes(b)
|
||||
|
||||
for tokens.Next() { // Next() returns true until end of data
|
||||
fmt.Println(tokens.Value()) // Do something with the current grapheme
|
||||
}
|
||||
```
|
||||
|
||||
### Performance
|
||||
|
||||
On a Mac M2 laptop, we see around 200MB/s, or around 100 million graphemes per second. You should see ~constant memory, and no allocations.
|
||||
|
||||
### Invalid inputs
|
||||
|
||||
Invalid UTF-8 input is considered undefined behavior. We test to ensure that bad inputs will not cause pathological outcomes, such as a panic or infinite loop. Callers should expect “garbage-in, garbage-out”.
|
||||
|
||||
Your pipeline should probably include a call to [`utf8.Valid()`](https://pkg.go.dev/unicode/utf8#Valid).
|
||||
28
vendor/github.com/clipperhouse/uax29/v2/graphemes/iterator.go
generated
vendored
Normal file
28
vendor/github.com/clipperhouse/uax29/v2/graphemes/iterator.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
||||
package graphemes
|
||||
|
||||
import "github.com/clipperhouse/uax29/v2/internal/iterators"
|
||||
|
||||
type Iterator[T iterators.Stringish] struct {
|
||||
*iterators.Iterator[T]
|
||||
}
|
||||
|
||||
var (
|
||||
splitFuncString = splitFunc[string]
|
||||
splitFuncBytes = splitFunc[[]byte]
|
||||
)
|
||||
|
||||
// FromString returns an iterator for the grapheme clusters in the input string.
|
||||
// Iterate while Next() is true, and access the grapheme via Value().
|
||||
func FromString(s string) Iterator[string] {
|
||||
return Iterator[string]{
|
||||
iterators.New(splitFuncString, s),
|
||||
}
|
||||
}
|
||||
|
||||
// FromBytes returns an iterator for the grapheme clusters in the input bytes.
|
||||
// Iterate while Next() is true, and access the grapheme via Value().
|
||||
func FromBytes(b []byte) Iterator[[]byte] {
|
||||
return Iterator[[]byte]{
|
||||
iterators.New(splitFuncBytes, b),
|
||||
}
|
||||
}
|
||||
25
vendor/github.com/clipperhouse/uax29/v2/graphemes/reader.go
generated
vendored
Normal file
25
vendor/github.com/clipperhouse/uax29/v2/graphemes/reader.go
generated
vendored
Normal file
@ -0,0 +1,25 @@
|
||||
// Package graphemes implements Unicode grapheme cluster boundaries: https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
|
||||
package graphemes
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"io"
|
||||
)
|
||||
|
||||
type Scanner struct {
|
||||
*bufio.Scanner
|
||||
}
|
||||
|
||||
// FromReader returns a Scanner, to split graphemes per
|
||||
// https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries.
|
||||
//
|
||||
// It embeds a [bufio.Scanner], so you can use its methods.
|
||||
//
|
||||
// Iterate through graphemes by calling Scan() until false, then check Err().
|
||||
func FromReader(r io.Reader) *Scanner {
|
||||
sc := bufio.NewScanner(r)
|
||||
sc.Split(SplitFunc)
|
||||
return &Scanner{
|
||||
Scanner: sc,
|
||||
}
|
||||
}
|
||||
174
vendor/github.com/clipperhouse/uax29/v2/graphemes/splitfunc.go
generated
vendored
Normal file
174
vendor/github.com/clipperhouse/uax29/v2/graphemes/splitfunc.go
generated
vendored
Normal file
@ -0,0 +1,174 @@
|
||||
package graphemes
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
|
||||
"github.com/clipperhouse/uax29/v2/internal/iterators"
|
||||
)
|
||||
|
||||
// is determines if lookup intersects propert(ies)
|
||||
func (lookup property) is(properties property) bool {
|
||||
return (lookup & properties) != 0
|
||||
}
|
||||
|
||||
const _Ignore = _Extend
|
||||
|
||||
// SplitFunc is a bufio.SplitFunc implementation of Unicode grapheme cluster segmentation, for use with bufio.Scanner.
|
||||
//
|
||||
// See https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries.
|
||||
var SplitFunc bufio.SplitFunc = splitFunc[[]byte]
|
||||
|
||||
func splitFunc[T iterators.Stringish](data T, atEOF bool) (advance int, token T, err error) {
|
||||
var empty T
|
||||
if len(data) == 0 {
|
||||
return 0, empty, nil
|
||||
}
|
||||
|
||||
// These vars are stateful across loop iterations
|
||||
var pos int
|
||||
var lastExIgnore property = 0 // "last excluding ignored categories"
|
||||
var lastLastExIgnore property = 0 // "last one before that"
|
||||
var regionalIndicatorCount int
|
||||
|
||||
// Rules are usually of the form Cat1 × Cat2; "current" refers to the first property
|
||||
// to the right of the ×, from which we look back or forward
|
||||
|
||||
current, w := lookup(data[pos:])
|
||||
if w == 0 {
|
||||
if !atEOF {
|
||||
// Rune extends past current data, request more
|
||||
return 0, empty, nil
|
||||
}
|
||||
pos = len(data)
|
||||
return pos, data[:pos], nil
|
||||
}
|
||||
|
||||
// https://unicode.org/reports/tr29/#GB1
|
||||
// Start of text always advances
|
||||
pos += w
|
||||
|
||||
for {
|
||||
eot := pos == len(data) // "end of text"
|
||||
|
||||
if eot {
|
||||
if !atEOF {
|
||||
// Token extends past current data, request more
|
||||
return 0, empty, nil
|
||||
}
|
||||
|
||||
// https://unicode.org/reports/tr29/#GB2
|
||||
break
|
||||
}
|
||||
|
||||
/*
|
||||
We've switched the evaluation order of GB1↓ and GB2↑. It's ok:
|
||||
because we've checked for len(data) at the top of this function,
|
||||
sot and eot are mutually exclusive, order doesn't matter.
|
||||
*/
|
||||
|
||||
// Rules are usually of the form Cat1 × Cat2; "current" refers to the first property
|
||||
// to the right of the ×, from which we look back or forward
|
||||
|
||||
// Remember previous properties to avoid lookups/lookbacks
|
||||
last := current
|
||||
if !last.is(_Ignore) {
|
||||
lastLastExIgnore = lastExIgnore
|
||||
lastExIgnore = last
|
||||
}
|
||||
|
||||
current, w = lookup(data[pos:])
|
||||
if w == 0 {
|
||||
if atEOF {
|
||||
// Just return the bytes, we can't do anything with them
|
||||
pos = len(data)
|
||||
break
|
||||
}
|
||||
// Rune extends past current data, request more
|
||||
return 0, empty, nil
|
||||
}
|
||||
|
||||
// Optimization: no rule can possibly apply
|
||||
if current|last == 0 { // i.e. both are zero
|
||||
break
|
||||
}
|
||||
|
||||
// https://unicode.org/reports/tr29/#GB3
|
||||
if current.is(_LF) && last.is(_CR) {
|
||||
pos += w
|
||||
continue
|
||||
}
|
||||
|
||||
// https://unicode.org/reports/tr29/#GB4
|
||||
// https://unicode.org/reports/tr29/#GB5
|
||||
if (current | last).is(_Control | _CR | _LF) {
|
||||
break
|
||||
}
|
||||
|
||||
// https://unicode.org/reports/tr29/#GB6
|
||||
if current.is(_L|_V|_LV|_LVT) && last.is(_L) {
|
||||
pos += w
|
||||
continue
|
||||
}
|
||||
|
||||
// https://unicode.org/reports/tr29/#GB7
|
||||
if current.is(_V|_T) && last.is(_LV|_V) {
|
||||
pos += w
|
||||
continue
|
||||
}
|
||||
|
||||
// https://unicode.org/reports/tr29/#GB8
|
||||
if current.is(_T) && last.is(_LVT|_T) {
|
||||
pos += w
|
||||
continue
|
||||
}
|
||||
|
||||
// https://unicode.org/reports/tr29/#GB9
|
||||
if current.is(_Extend | _ZWJ) {
|
||||
pos += w
|
||||
continue
|
||||
}
|
||||
|
||||
// https://unicode.org/reports/tr29/#GB9a
|
||||
if current.is(_SpacingMark) {
|
||||
pos += w
|
||||
continue
|
||||
}
|
||||
|
||||
// https://unicode.org/reports/tr29/#GB9b
|
||||
if last.is(_Prepend) {
|
||||
pos += w
|
||||
continue
|
||||
}
|
||||
|
||||
// https://unicode.org/reports/tr29/#GB9c
|
||||
// TODO(clipperhouse):
|
||||
// It appears to be added in Unicode 15.1.0:
|
||||
// https://unicode.org/versions/Unicode15.1.0/#Migration
|
||||
// This package currently supports Unicode 15.0.0, so
|
||||
// out of scope for now
|
||||
|
||||
// https://unicode.org/reports/tr29/#GB11
|
||||
if current.is(_ExtendedPictographic) && last.is(_ZWJ) && lastLastExIgnore.is(_ExtendedPictographic) {
|
||||
pos += w
|
||||
continue
|
||||
}
|
||||
|
||||
// https://unicode.org/reports/tr29/#GB12
|
||||
// https://unicode.org/reports/tr29/#GB13
|
||||
if (current & last).is(_RegionalIndicator) {
|
||||
regionalIndicatorCount++
|
||||
|
||||
odd := regionalIndicatorCount%2 == 1
|
||||
if odd {
|
||||
pos += w
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
// If we fall through all the above rules, it's a grapheme cluster break
|
||||
break
|
||||
}
|
||||
|
||||
// Return token
|
||||
return pos, data[:pos], nil
|
||||
}
|
||||
1409
vendor/github.com/clipperhouse/uax29/v2/graphemes/trie.go
generated
vendored
Normal file
1409
vendor/github.com/clipperhouse/uax29/v2/graphemes/trie.go
generated
vendored
Normal file
File diff suppressed because it is too large
Load Diff
85
vendor/github.com/clipperhouse/uax29/v2/internal/iterators/iterator.go
generated
vendored
Normal file
85
vendor/github.com/clipperhouse/uax29/v2/internal/iterators/iterator.go
generated
vendored
Normal file
@ -0,0 +1,85 @@
|
||||
package iterators
|
||||
|
||||
type Stringish interface {
|
||||
[]byte | string
|
||||
}
|
||||
|
||||
type SplitFunc[T Stringish] func(T, bool) (int, T, error)
|
||||
|
||||
// Iterator is a generic iterator for words that are either []byte or string.
|
||||
// Iterate while Next() is true, and access the word via Value().
|
||||
type Iterator[T Stringish] struct {
|
||||
split SplitFunc[T]
|
||||
data T
|
||||
start int
|
||||
pos int
|
||||
}
|
||||
|
||||
// New creates a new Iterator for the given data and SplitFunc.
|
||||
func New[T Stringish](split SplitFunc[T], data T) *Iterator[T] {
|
||||
return &Iterator[T]{
|
||||
split: split,
|
||||
data: data,
|
||||
}
|
||||
}
|
||||
|
||||
// SetText sets the text for the iterator to operate on, and resets all state.
|
||||
func (iter *Iterator[T]) SetText(data T) {
|
||||
iter.data = data
|
||||
iter.start = 0
|
||||
iter.pos = 0
|
||||
}
|
||||
|
||||
// Split sets the SplitFunc for the Iterator.
|
||||
func (iter *Iterator[T]) Split(split SplitFunc[T]) {
|
||||
iter.split = split
|
||||
}
|
||||
|
||||
// Next advances the iterator to the next token. It returns false when there
|
||||
// are no remaining tokens or an error occurred.
|
||||
func (iter *Iterator[T]) Next() bool {
|
||||
if iter.pos == len(iter.data) {
|
||||
return false
|
||||
}
|
||||
if iter.pos > len(iter.data) {
|
||||
panic("SplitFunc advanced beyond the end of the data")
|
||||
}
|
||||
|
||||
iter.start = iter.pos
|
||||
|
||||
advance, _, err := iter.split(iter.data[iter.pos:], true)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
if advance <= 0 {
|
||||
panic("SplitFunc returned a zero or negative advance")
|
||||
}
|
||||
|
||||
iter.pos += advance
|
||||
if iter.pos > len(iter.data) {
|
||||
panic("SplitFunc advanced beyond the end of the data")
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
// Value returns the current token.
|
||||
func (iter *Iterator[T]) Value() T {
|
||||
return iter.data[iter.start:iter.pos]
|
||||
}
|
||||
|
||||
// Start returns the byte position of the current token in the original data.
|
||||
func (iter *Iterator[T]) Start() int {
|
||||
return iter.start
|
||||
}
|
||||
|
||||
// End returns the byte position after the current token in the original data.
|
||||
func (iter *Iterator[T]) End() int {
|
||||
return iter.pos
|
||||
}
|
||||
|
||||
// Reset resets the iterator to the beginning of the data.
|
||||
func (iter *Iterator[T]) Reset() {
|
||||
iter.start = 0
|
||||
iter.pos = 0
|
||||
}
|
||||
Reference in New Issue
Block a user