chore: go mod tidy / vendor / make deps

2025-10-02 08:25:31 +02:00
parent 1c10e64c58
commit d63a1c28ea
505 changed files with 34448 additions and 35285 deletions

vendor/github.com/clipperhouse/uax29/v2/LICENSE generated vendored Normal file (21 lines)

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2020 Matt Sherman
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


@@ -0,0 +1,82 @@
An implementation of grapheme cluster boundaries from [Unicode text segmentation](https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries) (UAX 29), for Unicode version 15.0.0.
## Quick start
```
go get "github.com/clipperhouse/uax29/v2/graphemes"
```
```go
import "github.com/clipperhouse/uax29/v2/graphemes"
text := "Hello, 世界. Nice dog! 👍🐶"
tokens := graphemes.FromString(text)
for tokens.Next() { // Next() returns true until end of data
fmt.Println(tokens.Value()) // Do something with the current grapheme
}
```
[![Documentation](https://pkg.go.dev/badge/github.com/clipperhouse/uax29/v2/graphemes.svg)](https://pkg.go.dev/github.com/clipperhouse/uax29/v2/graphemes)
_A grapheme is a “single visible character”, which might be as simple as a single letter, or a complex emoji that consists of several Unicode code points._
## Conformance
We use the Unicode [test suite](https://unicode.org/reports/tr41/tr41-26.html#Tests29). Status:
![Go](https://github.com/clipperhouse/uax29/actions/workflows/gotest.yml/badge.svg)
## APIs
### If you have a `string`
```go
text := "Hello, 世界. Nice dog! 👍🐶"
tokens := graphemes.FromString(text)
for tokens.Next() { // Next() returns true until end of data
fmt.Println(tokens.Value()) // Do something with the current grapheme
}
```
### If you have an `io.Reader`
`FromReader` embeds a [`bufio.Scanner`](https://pkg.go.dev/bufio#Scanner), so just use those methods.
```go
r := getYourReader() // from a file or network maybe
tokens := graphemes.FromReader(r)
for tokens.Scan() { // Scan() returns true until error or EOF
fmt.Println(tokens.Text()) // Do something with the current grapheme
}
if tokens.Err() != nil { // Check the error
log.Fatal(tokens.Err())
}
```
### If you have a `[]byte`
```go
b := []byte("Hello, 世界. Nice dog! 👍🐶")
tokens := graphemes.FromBytes(b)
for tokens.Next() { // Next() returns true until end of data
fmt.Println(tokens.Value()) // Do something with the current grapheme
}
```
### Performance
On a Mac M2 laptop, we see around 200MB/s, or around 100 million graphemes per second. You should see ~constant memory, and no allocations.
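Those numbers will vary by machine; a rough sketch of a benchmark one could run locally to check throughput and allocations (not part of the vendored code, `BenchmarkGraphemes` is illustrative):
```go
package graphemes_test

import (
	"strings"
	"testing"

	"github.com/clipperhouse/uax29/v2/graphemes"
)

func BenchmarkGraphemes(b *testing.B) {
	text := strings.Repeat("Hello, 世界. Nice dog! 👍🐶 ", 1000)
	b.SetBytes(int64(len(text))) // report MB/s
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		tokens := graphemes.FromString(text)
		for tokens.Next() {
			_ = tokens.Value()
		}
	}
}
```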
### Invalid inputs
Invalid UTF-8 input is considered undefined behavior. We test to ensure that bad inputs will not cause pathological outcomes, such as a panic or infinite loop. Callers should expect “garbage-in, garbage-out”.
Your pipeline should probably include a call to [`utf8.Valid()`](https://pkg.go.dev/unicode/utf8#Valid).
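For illustration, a minimal sketch of that validation step (not part of the vendored README); `countGraphemes` is a hypothetical helper built only on the `FromBytes`/`Next` API shown above:
```go
package main

import (
	"fmt"
	"unicode/utf8"

	"github.com/clipperhouse/uax29/v2/graphemes"
)

// countGraphemes rejects invalid UTF-8 up front, then counts grapheme clusters.
func countGraphemes(b []byte) (int, error) {
	if !utf8.Valid(b) {
		return 0, fmt.Errorf("input is not valid UTF-8")
	}
	count := 0
	tokens := graphemes.FromBytes(b)
	for tokens.Next() {
		count++
	}
	return count, nil
}

func main() {
	n, err := countGraphemes([]byte("Hello, 世界. Nice dog! 👍🐶"))
	fmt.Println(n, err)
}
```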


@@ -0,0 +1,28 @@
package graphemes
import "github.com/clipperhouse/uax29/v2/internal/iterators"
type Iterator[T iterators.Stringish] struct {
*iterators.Iterator[T]
}
var (
splitFuncString = splitFunc[string]
splitFuncBytes = splitFunc[[]byte]
)
// FromString returns an iterator for the grapheme clusters in the input string.
// Iterate while Next() is true, and access the grapheme via Value().
func FromString(s string) Iterator[string] {
return Iterator[string]{
iterators.New(splitFuncString, s),
}
}
// FromBytes returns an iterator for the grapheme clusters in the input bytes.
// Iterate while Next() is true, and access the grapheme via Value().
func FromBytes(b []byte) Iterator[[]byte] {
return Iterator[[]byte]{
iterators.New(splitFuncBytes, b),
}
}
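
Because `Iterator[T]` embeds `*iterators.Iterator[T]`, the promoted `Start()` and `End()` methods (defined later in this diff) give the byte offsets of each grapheme. A small usage sketch, not part of the vendored file:
```go
package main

import (
	"fmt"

	"github.com/clipperhouse/uax29/v2/graphemes"
)

func main() {
	tokens := graphemes.FromString("Hi 👍🐶")
	for tokens.Next() {
		// Start/End are promoted from the embedded *iterators.Iterator
		fmt.Printf("%q spans bytes [%d:%d)\n", tokens.Value(), tokens.Start(), tokens.End())
	}
}
```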


@@ -0,0 +1,25 @@
// Package graphemes implements Unicode grapheme cluster boundaries: https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
package graphemes
import (
"bufio"
"io"
)
type Scanner struct {
*bufio.Scanner
}
// FromReader returns a Scanner, to split graphemes per
// https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries.
//
// It embeds a [bufio.Scanner], so you can use its methods.
//
// Iterate through graphemes by calling Scan() until false, then check Err().
func FromReader(r io.Reader) *Scanner {
sc := bufio.NewScanner(r)
sc.Split(SplitFunc)
return &Scanner{
Scanner: sc,
}
}


@@ -0,0 +1,174 @@
package graphemes
import (
"bufio"
"github.com/clipperhouse/uax29/v2/internal/iterators"
)
// is reports whether lookup intersects any of the given properties
func (lookup property) is(properties property) bool {
return (lookup & properties) != 0
}
const _Ignore = _Extend
// SplitFunc is a bufio.SplitFunc implementation of Unicode grapheme cluster segmentation, for use with bufio.Scanner.
//
// See https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries.
var SplitFunc bufio.SplitFunc = splitFunc[[]byte]
func splitFunc[T iterators.Stringish](data T, atEOF bool) (advance int, token T, err error) {
var empty T
if len(data) == 0 {
return 0, empty, nil
}
// These vars are stateful across loop iterations
var pos int
var lastExIgnore property = 0 // "last excluding ignored categories"
var lastLastExIgnore property = 0 // "last one before that"
var regionalIndicatorCount int
// Rules are usually of the form Cat1 × Cat2; "current" refers to the first property
// to the right of the ×, from which we look back or forward
current, w := lookup(data[pos:])
if w == 0 {
if !atEOF {
// Rune extends past current data, request more
return 0, empty, nil
}
pos = len(data)
return pos, data[:pos], nil
}
// https://unicode.org/reports/tr29/#GB1
// Start of text always advances
pos += w
for {
eot := pos == len(data) // "end of text"
if eot {
if !atEOF {
// Token extends past current data, request more
return 0, empty, nil
}
// https://unicode.org/reports/tr29/#GB2
break
}
/*
We've switched the evaluation order of GB1↓ and GB2↑. That's OK:
because we've checked len(data) at the top of this function,
sot and eot are mutually exclusive, so the order doesn't matter.
*/
// Rules are usually of the form Cat1 × Cat2; "current" refers to the first property
// to the right of the ×, from which we look back or forward
// Remember previous properties to avoid lookups/lookbacks
last := current
if !last.is(_Ignore) {
lastLastExIgnore = lastExIgnore
lastExIgnore = last
}
current, w = lookup(data[pos:])
if w == 0 {
if atEOF {
// Just return the bytes, we can't do anything with them
pos = len(data)
break
}
// Rune extends past current data, request more
return 0, empty, nil
}
// Optimization: no rule can possibly apply
if current|last == 0 { // i.e. both are zero
break
}
// https://unicode.org/reports/tr29/#GB3
if current.is(_LF) && last.is(_CR) {
pos += w
continue
}
// https://unicode.org/reports/tr29/#GB4
// https://unicode.org/reports/tr29/#GB5
if (current | last).is(_Control | _CR | _LF) {
break
}
// https://unicode.org/reports/tr29/#GB6
if current.is(_L|_V|_LV|_LVT) && last.is(_L) {
pos += w
continue
}
// https://unicode.org/reports/tr29/#GB7
if current.is(_V|_T) && last.is(_LV|_V) {
pos += w
continue
}
// https://unicode.org/reports/tr29/#GB8
if current.is(_T) && last.is(_LVT|_T) {
pos += w
continue
}
// https://unicode.org/reports/tr29/#GB9
if current.is(_Extend | _ZWJ) {
pos += w
continue
}
// https://unicode.org/reports/tr29/#GB9a
if current.is(_SpacingMark) {
pos += w
continue
}
// https://unicode.org/reports/tr29/#GB9b
if last.is(_Prepend) {
pos += w
continue
}
// https://unicode.org/reports/tr29/#GB9c
// TODO(clipperhouse):
// It appears to be added in Unicode 15.1.0:
// https://unicode.org/versions/Unicode15.1.0/#Migration
// This package currently supports Unicode 15.0.0, so
// out of scope for now
// https://unicode.org/reports/tr29/#GB11
if current.is(_ExtendedPictographic) && last.is(_ZWJ) && lastLastExIgnore.is(_ExtendedPictographic) {
pos += w
continue
}
// https://unicode.org/reports/tr29/#GB12
// https://unicode.org/reports/tr29/#GB13
if (current & last).is(_RegionalIndicator) {
regionalIndicatorCount++
odd := regionalIndicatorCount%2 == 1
if odd {
pos += w
continue
}
}
// If we fall through all the above rules, it's a grapheme cluster break
break
}
// Return token
return pos, data[:pos], nil
}
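
Since the exported `SplitFunc` is a `bufio.SplitFunc`, it can also be handed directly to a `bufio.Scanner`; a minimal sketch of that wiring, which is essentially what `FromReader` above already does:
```go
package main

import (
	"bufio"
	"fmt"
	"log"
	"strings"

	"github.com/clipperhouse/uax29/v2/graphemes"
)

func main() {
	sc := bufio.NewScanner(strings.NewReader("Hello, 世界 👍🐶"))
	sc.Split(graphemes.SplitFunc) // one token per grapheme cluster
	for sc.Scan() {
		fmt.Println(sc.Text())
	}
	if err := sc.Err(); err != nil {
		log.Fatal(err)
	}
}
```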

vendor/github.com/clipperhouse/uax29/v2/graphemes/trie.go generated vendored Normal file (1409 lines)

File diff suppressed because it is too large.


@@ -0,0 +1,85 @@
package iterators
type Stringish interface {
[]byte | string
}
type SplitFunc[T Stringish] func(T, bool) (int, T, error)
// Iterator is a generic iterator for words that are either []byte or string.
// Iterate while Next() is true, and access the word via Value().
type Iterator[T Stringish] struct {
split SplitFunc[T]
data T
start int
pos int
}
// New creates a new Iterator for the given data and SplitFunc.
func New[T Stringish](split SplitFunc[T], data T) *Iterator[T] {
return &Iterator[T]{
split: split,
data: data,
}
}
// SetText sets the text for the iterator to operate on, and resets all state.
func (iter *Iterator[T]) SetText(data T) {
iter.data = data
iter.start = 0
iter.pos = 0
}
// Split sets the SplitFunc for the Iterator.
func (iter *Iterator[T]) Split(split SplitFunc[T]) {
iter.split = split
}
// Next advances the iterator to the next token. It returns false when there
// are no remaining tokens or an error occurred.
func (iter *Iterator[T]) Next() bool {
if iter.pos == len(iter.data) {
return false
}
if iter.pos > len(iter.data) {
panic("SplitFunc advanced beyond the end of the data")
}
iter.start = iter.pos
advance, _, err := iter.split(iter.data[iter.pos:], true)
if err != nil {
panic(err)
}
if advance <= 0 {
panic("SplitFunc returned a zero or negative advance")
}
iter.pos += advance
if iter.pos > len(iter.data) {
panic("SplitFunc advanced beyond the end of the data")
}
return true
}
// Value returns the current token.
func (iter *Iterator[T]) Value() T {
return iter.data[iter.start:iter.pos]
}
// Start returns the byte position of the current token in the original data.
func (iter *Iterator[T]) Start() int {
return iter.start
}
// End returns the byte position after the current token in the original data.
func (iter *Iterator[T]) End() int {
return iter.pos
}
// Reset resets the iterator to the beginning of the data.
func (iter *Iterator[T]) Reset() {
iter.start = 0
iter.pos = 0
}
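
To make the `SplitFunc` contract concrete, here is a toy sketch written as if it were a test file in this package (the package is `internal`, so it cannot be imported from outside the module). `singleBytes` is a made-up split function that advances one byte per token:
```go
package iterators

import "fmt"

// singleBytes is a toy SplitFunc: it always advances exactly one byte.
// A real split function, like the graphemes one earlier in this diff,
// advances by whole grapheme clusters instead.
func singleBytes(data []byte, atEOF bool) (int, []byte, error) {
	if len(data) == 0 {
		return 0, nil, nil
	}
	return 1, data[:1], nil
}

func ExampleIterator() {
	iter := New(singleBytes, []byte("abc"))
	for iter.Next() {
		fmt.Printf("%q [%d:%d)\n", iter.Value(), iter.Start(), iter.End())
	}
	// Output:
	// "a" [0:1)
	// "b" [1:2)
	// "c" [2:3)
}
```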