forked from toolshed/abra
211 lines
5.3 KiB
Go
211 lines
5.3 KiB
Go
package displaywidth
|
||
|
||
import (
|
||
"unicode/utf8"
|
||
|
||
"github.com/clipperhouse/stringish"
|
||
"github.com/clipperhouse/uax29/v2/graphemes"
|
||
)
|
||
|
||
// String calculates the display width of a string,
|
||
// by iterating over grapheme clusters in the string
|
||
// and summing their widths.
|
||
func String(s string) int {
|
||
return DefaultOptions.String(s)
|
||
}
|
||
|
||
// Bytes calculates the display width of a []byte,
|
||
// by iterating over grapheme clusters in the byte slice
|
||
// and summing their widths.
|
||
func Bytes(s []byte) int {
|
||
return DefaultOptions.Bytes(s)
|
||
}
|
||
|
||
// Rune calculates the display width of a rune. You
|
||
// should almost certainly use [String] or [Bytes] for
|
||
// most purposes.
|
||
//
|
||
// The smallest unit of display width is a grapheme
|
||
// cluster, not a rune. Iterating over runes to measure
|
||
// width is incorrect in most cases.
|
||
func Rune(r rune) int {
|
||
return DefaultOptions.Rune(r)
|
||
}
|
||
|
||
// Options configures how display width is calculated. Its only setting
// controls the treatment of ambiguous East Asian characters.
type Options struct {
	// EastAsianWidth selects the width of ambiguous East Asian characters:
	// false (the default) measures them as width 1, true measures them as
	// width 2.
	EastAsianWidth bool
}
|
||
|
||
// DefaultOptions is the default options for the display width
|
||
// calculation, which is EastAsianWidth: false.
|
||
var DefaultOptions = Options{EastAsianWidth: false}
|
||
|
||
// String calculates the display width of a string,
|
||
// for the given options, by iterating over grapheme clusters
|
||
// and summing their widths.
|
||
func (options Options) String(s string) int {
|
||
if len(s) == 0 {
|
||
return 0
|
||
}
|
||
|
||
total := 0
|
||
g := graphemes.FromString(s)
|
||
for g.Next() {
|
||
props := lookupProperties(g.Value())
|
||
total += props.width(options)
|
||
}
|
||
return total
|
||
}
|
||
|
||
// Bytes calculates the display width of a []byte,
|
||
// for the given options, by iterating over grapheme
|
||
// clusters in the byte slice and summing their widths.
|
||
func (options Options) Bytes(s []byte) int {
|
||
if len(s) == 0 {
|
||
return 0
|
||
}
|
||
|
||
total := 0
|
||
g := graphemes.FromBytes(s)
|
||
for g.Next() {
|
||
props := lookupProperties(g.Value())
|
||
total += props.width(options)
|
||
}
|
||
return total
|
||
}
|
||
|
||
// Rune calculates the display width of a rune,
|
||
// for the given options.
|
||
//
|
||
// The smallest unit of display width is a grapheme
|
||
// cluster, not a rune. Iterating over runes to measure
|
||
// width is incorrect in most cases.
|
||
func (options Options) Rune(r rune) int {
|
||
// Fast path for ASCII
|
||
if r < utf8.RuneSelf {
|
||
if isASCIIControl(byte(r)) {
|
||
// Control (0x00-0x1F) and DEL (0x7F)
|
||
return 0
|
||
}
|
||
// ASCII printable (0x20-0x7E)
|
||
return 1
|
||
}
|
||
|
||
// Surrogates (U+D800-U+DFFF) are invalid UTF-8 and have zero width
|
||
// Other packages might turn them into the replacement character (U+FFFD)
|
||
// in which case, we won't see it.
|
||
if r >= 0xD800 && r <= 0xDFFF {
|
||
return 0
|
||
}
|
||
|
||
// Stack-allocated to avoid heap allocation
|
||
var buf [4]byte // UTF-8 is at most 4 bytes
|
||
n := utf8.EncodeRune(buf[:], r)
|
||
// Skip the grapheme iterator and directly lookup properties
|
||
props := lookupProperties(buf[:n])
|
||
return props.width(options)
|
||
}
|
||
|
||
// isASCIIControl reports whether b is an ASCII control byte: one of the C0
// controls (0x00-0x1F) or DEL (0x7F).
func isASCIIControl(b byte) bool {
	const (
		firstPrintable = 0x20
		del            = 0x7F
	)
	return b == del || b < firstPrintable
}
|
||
|
||
// isRIPrefix checks if the slice matches the Regional Indicator prefix
|
||
// (F0 9F 87). It assumes len(s) >= 3.
|
||
func isRIPrefix[T stringish.Interface](s T) bool {
|
||
return s[0] == 0xF0 && s[1] == 0x9F && s[2] == 0x87
|
||
}
|
||
|
||
// isVS16 checks if the slice matches VS16 (U+FE0F) UTF-8 encoding
|
||
// (EF B8 8F). It assumes len(s) >= 3.
|
||
func isVS16[T stringish.Interface](s T) bool {
|
||
return s[0] == 0xEF && s[1] == 0xB8 && s[2] == 0x8F
|
||
}
|
||
|
||
// lookupProperties returns the properties for the first character in a string
|
||
func lookupProperties[T stringish.Interface](s T) property {
|
||
l := len(s)
|
||
|
||
if l == 0 {
|
||
return 0
|
||
}
|
||
|
||
b := s[0]
|
||
if isASCIIControl(b) {
|
||
return _Zero_Width
|
||
}
|
||
|
||
if b < utf8.RuneSelf {
|
||
// Check for variation selector after ASCII (e.g., keycap sequences like 1️⃣)
|
||
if l >= 4 {
|
||
// Subslice may help eliminate bounds checks
|
||
vs := s[1:4]
|
||
if isVS16(vs) {
|
||
// VS16 requests emoji presentation (width 2)
|
||
return _Emoji
|
||
}
|
||
// VS15 (0x8E) requests text presentation but does not affect width,
|
||
// in my reading of Unicode TR51. Falls through to _Default.
|
||
}
|
||
return _Default
|
||
}
|
||
|
||
// Regional indicator pair (flag)
|
||
if l >= 8 {
|
||
// Subslice may help eliminate bounds checks
|
||
ri := s[:8]
|
||
if isRIPrefix(ri[0:3]) {
|
||
b3 := ri[3]
|
||
if b3 >= 0xA6 && b3 <= 0xBF && isRIPrefix(ri[4:7]) {
|
||
b7 := ri[7]
|
||
if b7 >= 0xA6 && b7 <= 0xBF {
|
||
return _Emoji
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
props, size := lookup(s)
|
||
p := property(props)
|
||
|
||
// Variation Selectors
|
||
if size > 0 && l >= size+3 {
|
||
// Subslice may help eliminate bounds checks
|
||
vs := s[size : size+3]
|
||
if isVS16(vs) {
|
||
// VS16 requests emoji presentation (width 2)
|
||
return _Emoji
|
||
}
|
||
// VS15 (0x8E) requests text presentation but does not affect width,
|
||
// in my reading of Unicode TR51. Falls through to return the base
|
||
// character's property (p).
|
||
}
|
||
|
||
return p
|
||
}
|
||
|
||
// _Default is the zero property value. widthTable maps it to a width of 1,
// and lookupProperties returns it for printable ASCII characters.
const _Default property = 0
|
||
|
||
// a jump table of sorts, instead of a switch
|
||
var widthTable = [5]int{
|
||
_Default: 1,
|
||
_Zero_Width: 0,
|
||
_East_Asian_Wide: 2,
|
||
_East_Asian_Ambiguous: 1,
|
||
_Emoji: 2,
|
||
}
|
||
|
||
// width determines the display width of a character based on its properties
|
||
// and configuration options
|
||
func (p property) width(options Options) int {
|
||
if options.EastAsianWidth && p == _East_Asian_Ambiguous {
|
||
return 2
|
||
}
|
||
|
||
return widthTable[p]
|
||
}
|