This commit is contained in:
282
vendor/github.com/rivo/uniseg/wordrules.go
generated
vendored
Normal file
282
vendor/github.com/rivo/uniseg/wordrules.go
generated
vendored
Normal file
@ -0,0 +1,282 @@
|
||||
package uniseg
|
||||
|
||||
import "unicode/utf8"
|
||||
|
||||
// The states of the word break parser.
|
||||
const (
|
||||
wbAny = iota
|
||||
wbCR
|
||||
wbLF
|
||||
wbNewline
|
||||
wbWSegSpace
|
||||
wbHebrewLetter
|
||||
wbALetter
|
||||
wbWB7
|
||||
wbWB7c
|
||||
wbNumeric
|
||||
wbWB11
|
||||
wbKatakana
|
||||
wbExtendNumLet
|
||||
wbOddRI
|
||||
wbEvenRI
|
||||
wbZWJBit = 16 // This bit is set for any states followed by at least one zero-width joiner (see WB4 and WB3c).
|
||||
)
|
||||
|
||||
// wbTransitions implements the word break parser's state transitions. It's
|
||||
// anologous to [grTransitions], see comments there for details.
|
||||
//
|
||||
// Unicode version 15.0.0.
|
||||
func wbTransitions(state, prop int) (newState int, wordBreak bool, rule int) {
|
||||
switch uint64(state) | uint64(prop)<<32 {
|
||||
// WB3b.
|
||||
case wbAny | prNewline<<32:
|
||||
return wbNewline, true, 32
|
||||
case wbAny | prCR<<32:
|
||||
return wbCR, true, 32
|
||||
case wbAny | prLF<<32:
|
||||
return wbLF, true, 32
|
||||
|
||||
// WB3a.
|
||||
case wbNewline | prAny<<32:
|
||||
return wbAny, true, 31
|
||||
case wbCR | prAny<<32:
|
||||
return wbAny, true, 31
|
||||
case wbLF | prAny<<32:
|
||||
return wbAny, true, 31
|
||||
|
||||
// WB3.
|
||||
case wbCR | prLF<<32:
|
||||
return wbLF, false, 30
|
||||
|
||||
// WB3d.
|
||||
case wbAny | prWSegSpace<<32:
|
||||
return wbWSegSpace, true, 9990
|
||||
case wbWSegSpace | prWSegSpace<<32:
|
||||
return wbWSegSpace, false, 34
|
||||
|
||||
// WB5.
|
||||
case wbAny | prALetter<<32:
|
||||
return wbALetter, true, 9990
|
||||
case wbAny | prHebrewLetter<<32:
|
||||
return wbHebrewLetter, true, 9990
|
||||
case wbALetter | prALetter<<32:
|
||||
return wbALetter, false, 50
|
||||
case wbALetter | prHebrewLetter<<32:
|
||||
return wbHebrewLetter, false, 50
|
||||
case wbHebrewLetter | prALetter<<32:
|
||||
return wbALetter, false, 50
|
||||
case wbHebrewLetter | prHebrewLetter<<32:
|
||||
return wbHebrewLetter, false, 50
|
||||
|
||||
// WB7. Transitions to wbWB7 handled by transitionWordBreakState().
|
||||
case wbWB7 | prALetter<<32:
|
||||
return wbALetter, false, 70
|
||||
case wbWB7 | prHebrewLetter<<32:
|
||||
return wbHebrewLetter, false, 70
|
||||
|
||||
// WB7a.
|
||||
case wbHebrewLetter | prSingleQuote<<32:
|
||||
return wbAny, false, 71
|
||||
|
||||
// WB7c. Transitions to wbWB7c handled by transitionWordBreakState().
|
||||
case wbWB7c | prHebrewLetter<<32:
|
||||
return wbHebrewLetter, false, 73
|
||||
|
||||
// WB8.
|
||||
case wbAny | prNumeric<<32:
|
||||
return wbNumeric, true, 9990
|
||||
case wbNumeric | prNumeric<<32:
|
||||
return wbNumeric, false, 80
|
||||
|
||||
// WB9.
|
||||
case wbALetter | prNumeric<<32:
|
||||
return wbNumeric, false, 90
|
||||
case wbHebrewLetter | prNumeric<<32:
|
||||
return wbNumeric, false, 90
|
||||
|
||||
// WB10.
|
||||
case wbNumeric | prALetter<<32:
|
||||
return wbALetter, false, 100
|
||||
case wbNumeric | prHebrewLetter<<32:
|
||||
return wbHebrewLetter, false, 100
|
||||
|
||||
// WB11. Transitions to wbWB11 handled by transitionWordBreakState().
|
||||
case wbWB11 | prNumeric<<32:
|
||||
return wbNumeric, false, 110
|
||||
|
||||
// WB13.
|
||||
case wbAny | prKatakana<<32:
|
||||
return wbKatakana, true, 9990
|
||||
case wbKatakana | prKatakana<<32:
|
||||
return wbKatakana, false, 130
|
||||
|
||||
// WB13a.
|
||||
case wbAny | prExtendNumLet<<32:
|
||||
return wbExtendNumLet, true, 9990
|
||||
case wbALetter | prExtendNumLet<<32:
|
||||
return wbExtendNumLet, false, 131
|
||||
case wbHebrewLetter | prExtendNumLet<<32:
|
||||
return wbExtendNumLet, false, 131
|
||||
case wbNumeric | prExtendNumLet<<32:
|
||||
return wbExtendNumLet, false, 131
|
||||
case wbKatakana | prExtendNumLet<<32:
|
||||
return wbExtendNumLet, false, 131
|
||||
case wbExtendNumLet | prExtendNumLet<<32:
|
||||
return wbExtendNumLet, false, 131
|
||||
|
||||
// WB13b.
|
||||
case wbExtendNumLet | prALetter<<32:
|
||||
return wbALetter, false, 132
|
||||
case wbExtendNumLet | prHebrewLetter<<32:
|
||||
return wbHebrewLetter, false, 132
|
||||
case wbExtendNumLet | prNumeric<<32:
|
||||
return wbNumeric, false, 132
|
||||
case wbExtendNumLet | prKatakana<<32:
|
||||
return wbKatakana, false, 132
|
||||
|
||||
default:
|
||||
return -1, false, -1
|
||||
}
|
||||
}
|
||||
|
||||
// transitionWordBreakState determines the new state of the word break parser
|
||||
// given the current state and the next code point. It also returns whether a
|
||||
// word boundary was detected. If more than one code point is needed to
|
||||
// determine the new state, the byte slice or the string starting after rune "r"
|
||||
// can be used (whichever is not nil or empty) for further lookups.
|
||||
func transitionWordBreakState(state int, r rune, b []byte, str string) (newState int, wordBreak bool) {
|
||||
// Determine the property of the next character.
|
||||
nextProperty := property(workBreakCodePoints, r)
|
||||
|
||||
// "Replacing Ignore Rules".
|
||||
if nextProperty == prZWJ {
|
||||
// WB4 (for zero-width joiners).
|
||||
if state == wbNewline || state == wbCR || state == wbLF {
|
||||
return wbAny | wbZWJBit, true // Make sure we don't apply WB4 to WB3a.
|
||||
}
|
||||
if state < 0 {
|
||||
return wbAny | wbZWJBit, false
|
||||
}
|
||||
return state | wbZWJBit, false
|
||||
} else if nextProperty == prExtend || nextProperty == prFormat {
|
||||
// WB4 (for Extend and Format).
|
||||
if state == wbNewline || state == wbCR || state == wbLF {
|
||||
return wbAny, true // Make sure we don't apply WB4 to WB3a.
|
||||
}
|
||||
if state == wbWSegSpace || state == wbAny|wbZWJBit {
|
||||
return wbAny, false // We don't break but this is also not WB3d or WB3c.
|
||||
}
|
||||
if state < 0 {
|
||||
return wbAny, false
|
||||
}
|
||||
return state, false
|
||||
} else if nextProperty == prExtendedPictographic && state >= 0 && state&wbZWJBit != 0 {
|
||||
// WB3c.
|
||||
return wbAny, false
|
||||
}
|
||||
if state >= 0 {
|
||||
state = state &^ wbZWJBit
|
||||
}
|
||||
|
||||
// Find the applicable transition in the table.
|
||||
var rule int
|
||||
newState, wordBreak, rule = wbTransitions(state, nextProperty)
|
||||
if newState < 0 {
|
||||
// No specific transition found. Try the less specific ones.
|
||||
anyPropState, anyPropWordBreak, anyPropRule := wbTransitions(state, prAny)
|
||||
anyStateState, anyStateWordBreak, anyStateRule := wbTransitions(wbAny, nextProperty)
|
||||
if anyPropState >= 0 && anyStateState >= 0 {
|
||||
// Both apply. We'll use a mix (see comments for grTransitions).
|
||||
newState, wordBreak, rule = anyStateState, anyStateWordBreak, anyStateRule
|
||||
if anyPropRule < anyStateRule {
|
||||
wordBreak, rule = anyPropWordBreak, anyPropRule
|
||||
}
|
||||
} else if anyPropState >= 0 {
|
||||
// We only have a specific state.
|
||||
newState, wordBreak, rule = anyPropState, anyPropWordBreak, anyPropRule
|
||||
// This branch will probably never be reached because okAnyState will
|
||||
// always be true given the current transition map. But we keep it here
|
||||
// for future modifications to the transition map where this may not be
|
||||
// true anymore.
|
||||
} else if anyStateState >= 0 {
|
||||
// We only have a specific property.
|
||||
newState, wordBreak, rule = anyStateState, anyStateWordBreak, anyStateRule
|
||||
} else {
|
||||
// No known transition. WB999: Any ÷ Any.
|
||||
newState, wordBreak, rule = wbAny, true, 9990
|
||||
}
|
||||
}
|
||||
|
||||
// For those rules that need to look up runes further in the string, we
|
||||
// determine the property after nextProperty, skipping over Format, Extend,
|
||||
// and ZWJ (according to WB4). It's -1 if not needed, if such a rune cannot
|
||||
// be determined (because the text ends or the rune is faulty).
|
||||
farProperty := -1
|
||||
if rule > 60 &&
|
||||
(state == wbALetter || state == wbHebrewLetter || state == wbNumeric) &&
|
||||
(nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote || // WB6.
|
||||
nextProperty == prDoubleQuote || // WB7b.
|
||||
nextProperty == prMidNum) { // WB12.
|
||||
for {
|
||||
var (
|
||||
r rune
|
||||
length int
|
||||
)
|
||||
if b != nil { // Byte slice version.
|
||||
r, length = utf8.DecodeRune(b)
|
||||
b = b[length:]
|
||||
} else { // String version.
|
||||
r, length = utf8.DecodeRuneInString(str)
|
||||
str = str[length:]
|
||||
}
|
||||
if r == utf8.RuneError {
|
||||
break
|
||||
}
|
||||
prop := property(workBreakCodePoints, r)
|
||||
if prop == prExtend || prop == prFormat || prop == prZWJ {
|
||||
continue
|
||||
}
|
||||
farProperty = prop
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// WB6.
|
||||
if rule > 60 &&
|
||||
(state == wbALetter || state == wbHebrewLetter) &&
|
||||
(nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote) &&
|
||||
(farProperty == prALetter || farProperty == prHebrewLetter) {
|
||||
return wbWB7, false
|
||||
}
|
||||
|
||||
// WB7b.
|
||||
if rule > 72 &&
|
||||
state == wbHebrewLetter &&
|
||||
nextProperty == prDoubleQuote &&
|
||||
farProperty == prHebrewLetter {
|
||||
return wbWB7c, false
|
||||
}
|
||||
|
||||
// WB12.
|
||||
if rule > 120 &&
|
||||
state == wbNumeric &&
|
||||
(nextProperty == prMidNum || nextProperty == prMidNumLet || nextProperty == prSingleQuote) &&
|
||||
farProperty == prNumeric {
|
||||
return wbWB11, false
|
||||
}
|
||||
|
||||
// WB15 and WB16.
|
||||
if newState == wbAny && nextProperty == prRegionalIndicator {
|
||||
if state != wbOddRI && state != wbEvenRI { // Includes state == -1.
|
||||
// Transition into the first RI.
|
||||
return wbOddRI, true
|
||||
}
|
||||
if state == wbOddRI {
|
||||
// Don't break pairs of Regional Indicators.
|
||||
return wbEvenRI, false
|
||||
}
|
||||
return wbOddRI, true // We can break after a pair.
|
||||
}
|
||||
|
||||
return
|
||||
}
|
Reference in New Issue
Block a user