about summary refs log tree commit diff stats
path: root/utf8.lua
diff options
context:
space:
mode:
authorCase Duckworth2024-03-01 22:53:48 -0600
committerCase Duckworth2024-03-01 22:53:48 -0600
commitcdb8a9c957374f691e2cccba384bf05dd843b197 (patch)
tree91ed736d1e7e0a9170078a79038844122ab3b5d3 /utf8.lua
parentCredit Norvig (diff)
downloadlam-cdb8a9c957374f691e2cccba384bf05dd843b197.tar.gz
lam-cdb8a9c957374f691e2cccba384bf05dd843b197.zip
Vendor UTF-8
https://github.com/meepen/Lua-5.1-UTF-8
Diffstat (limited to 'utf8.lua')
-rw-r--r--utf8.lua224
1 files changed, 224 insertions, 0 deletions
diff --git a/utf8.lua b/utf8.lua new file mode 100644 index 0000000..2bfcf8a --- /dev/null +++ b/utf8.lua
@@ -0,0 +1,224 @@
1--- lam.utf8 --- I did not write this code. I did lightly edit it to work with
2-- this project. This module was written by GitHub user meepen and is under the
3-- CC0 1.0 license. The source can be found here:
4-- https://github.com/meepen/Lua-5.1-UTF-8/
local utf8 = {}
-- Bitwise-operation library used by the encoder/decoder below.
-- NOTE(review): `bit` is a global only on LuaJIT (or with LuaBitOp loaded);
-- confirm the target runtime provides it.
local bit = bit
local error = error
local ipairs = ipairs
local string = string
local table = table
-- `unpack` lives in `table` from Lua 5.2 onwards; accept either location.
local unpack = table.unpack or unpack

-- Pattern that can be used with the string library to match a single UTF-8
-- byte-sequence. This expects the string to contain valid UTF-8 data.
-- The byte ranges are written with decimal escapes (\1-\127, \194-\244 and
-- \128-\191, i.e. 0x01-0x7F, 0xC2-0xF4 and 0x80-0xBF) because the Lua 5.1
-- lexer has no \xNN escape: there "\x01" is read as the three characters
-- "x01", which silently corrupts the character class.
utf8.charpattern = "[%z\1-\127\194-\244][\128-\191]*"
16
-- Normalises one or more string indices to absolute, 1-based positions.
-- A positive index is kept unchanged; any other index is counted back from
-- the end of `str`, the way the string library treats negative indices
-- (-1 is the last byte). Raises "bad index to string (out of range)" when a
-- converted index falls outside 1..#str. Returns the converted indices in
-- the order they were passed.
local function strRelToAbs(str, ...)
    local positions = { ... }
    for slot, index in ipairs(positions) do
        local absolute
        if index > 0 then
            absolute = index
        else
            absolute = #str + index + 1
        end
        if absolute > #str or absolute < 1 then
            -- Level 3 reports the error two call frames above this helper.
            error("bad index to string (out of range)", 3)
        end
        positions[slot] = absolute
    end
    return unpack(positions)
end
30
-- Decodes a single UTF-8 byte-sequence from a string, ensuring it is valid.
-- `startPos` defaults to 1 and may be negative (counted from the end); an
-- out-of-range position raises an error via strRelToAbs.
-- Returns the index of the first and last byte of the sequence, or nil when
-- the bytes at `startPos` do not form a well-formed sequence: a stray
-- continuation byte, a lead byte outside 0xC2-0xF4, a sequence cut short by
-- the end of the string, an overlong encoding, a UTF-16 surrogate, or a
-- value above U+10FFFF.
local function decode(str, startPos)
    startPos = strRelToAbs(str, startPos or 1)
    local b1 = str:byte(startPos)
    -- Single-byte sequence
    if b1 < 0x80 then
        return startPos, startPos
    end
    -- Validate first byte of multi-byte sequence
    if b1 < 0xC2 or b1 > 0xF4 then
        return nil
    end
    -- Number of continuation bytes announced by the lead byte
    local contByteCount = (b1 >= 0xF0 and 3) or (b1 >= 0xE0 and 2) or 1
    local endPos = startPos + contByteCount
    -- The string ends before the sequence does: truncated, hence invalid.
    if endPos > #str then
        return nil
    end
    -- The first continuation byte has a narrower range after some lead
    -- bytes: E0/F0 exclude overlong forms, ED excludes surrogates
    -- (U+D800-U+DFFF) and F4 excludes values above U+10FFFF.
    local lo, hi = 0x80, 0xBF
    if b1 == 0xE0 then
        lo = 0xA0
    elseif b1 == 0xED then
        hi = 0x9F
    elseif b1 == 0xF0 then
        lo = 0x90
    elseif b1 == 0xF4 then
        hi = 0x8F
    end
    for i = startPos + 1, endPos do
        local bX = str:byte(i)
        if bX < lo or bX > hi then
            return nil
        end
        -- Every later continuation byte uses the full 0x80-0xBF range.
        lo, hi = 0x80, 0xBF
    end
    return startPos, endPos
end
57
-- Takes zero or more integers and returns a string containing the UTF-8
-- representation of each, concatenated in argument order.
-- Raises "bad argument #k to char (out of range)" when the k'th value lies
-- outside 0..0x10FFFF.
-- The bytes are assembled with plain arithmetic (floor-division and modulo
-- by powers of 64) so the function does not need a bit-operation library.
function utf8.char(...)
    local floor = math.floor
    local buf = {}
    for k, v in ipairs { ... } do
        if v < 0 or v > 0x10FFFF then
            error("bad argument #" .. k ..
                " to char (out of range)", 2)
        end
        if v < 0x80 then -- Single-byte sequence
            buf[#buf + 1] = string.char(v)
        elseif v < 0x800 then -- Two-byte sequence: 110xxxxx 10xxxxxx
            buf[#buf + 1] = string.char(
                0xC0 + floor(v / 0x40),
                0x80 + v % 0x40)
        elseif v < 0x10000 then -- Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
            buf[#buf + 1] = string.char(
                0xE0 + floor(v / 0x1000),
                0x80 + floor(v / 0x40) % 0x40,
                0x80 + v % 0x40)
        else -- Four-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            buf[#buf + 1] = string.char(
                0xF0 + floor(v / 0x40000),
                0x80 + floor(v / 0x1000) % 0x40,
                0x80 + floor(v / 0x40) % 0x40,
                0x80 + v % 0x40)
        end
    end
    return table.concat(buf, "")
end
89
-- Builds an iterator over the UTF-8 sequences of `str`, for use in a
-- generic `for` loop. Each step yields the byte index where a sequence
-- starts and the sequence itself as a string (not its code point).
-- The iterator raises "invalid UTF-8 code" when it reaches bytes that
-- `decode` rejects.
function utf8.codes(str)
    local cursor = 1
    local function nextSequence()
        if cursor <= #str then
            local first, last = decode(str, cursor)
            if not first then
                error("invalid UTF-8 code", 2)
            end
            -- Resume right after the sequence just produced.
            cursor = last + 1
            return first, str:sub(first, last)
        end
        -- Past the last byte: end of the iteration.
        return nil
    end
    return nextSequence
end
107
-- Returns the code point (as an integer) of every UTF-8 sequence that starts
-- between byte positions startPos and endPos of `str`, as multiple results.
-- startPos defaults to 1, endPos defaults to startPos; both may be negative.
-- At least one sequence is always decoded. Raises "invalid UTF-8 code" when
-- a sequence is rejected by `decode` or is cut short by the end of the
-- string, and an out-of-range error (via strRelToAbs) for bad positions.
function utf8.codepoint(str, startPos, endPos)
    startPos, endPos = strRelToAbs(str,
        startPos or 1,
        endPos or startPos or 1)
    local ret = {}
    repeat
        local seqStartPos, seqEndPos = decode(str, startPos)
        -- A sequence claiming bytes past the end of the string is truncated;
        -- report it as invalid instead of reading a missing byte below.
        if not seqStartPos or seqEndPos > #str then
            error("invalid UTF-8 code", 2)
        end
        -- Increment current string index
        startPos = seqEndPos + 1
        -- Amount of bytes making up our sequence
        local len = seqEndPos - seqStartPos + 1
        local cp = str:byte(seqStartPos)
        if len > 1 then -- Multi-byte codepoint
            -- Keep only the payload bits of the lead byte:
            -- 5 bits for 110xxxxx, 4 for 1110xxxx, 3 for 11110xxx.
            cp = cp % ((len == 2 and 0x20) or (len == 3 and 0x10) or 0x08)
            -- Each continuation byte (10xxxxxx) contributes its low 6 bits.
            for i = seqStartPos + 1, seqEndPos do
                cp = cp * 0x40 + str:byte(i) % 0x40
            end
        end
        ret[#ret + 1] = cp
    until seqEndPos >= endPos
    return unpack(ret)
end
142
-- Returns the number of UTF-8 sequences that start between byte positions
-- startPos and endPos of `str`. startPos defaults to 1, endPos defaults to
-- -1 (the last byte); both may be negative.
-- When an invalid or truncated sequence is hit, returns false followed by
-- the byte index where that sequence starts.
-- An empty string measured over its default range has length 0; any other
-- position outside the string raises an out-of-range error (via
-- strRelToAbs).
function utf8.len(str, startPos, endPos)
    -- strRelToAbs rejects every index into an empty string, so answer the
    -- whole-string query for "" here before positions are normalised.
    if #str == 0 and (startPos or 1) == 1 and (endPos or -1) == -1 then
        return 0
    end
    startPos, endPos = strRelToAbs(str, startPos or 1, endPos or -1)
    local len = 0
    repeat
        local seqStartPos, seqEndPos = decode(str, startPos)
        -- Hit an invalid sequence, or one running past the end of the string?
        if not seqStartPos or seqEndPos > #str then
            return false, startPos
        end
        -- Increment current string pointer
        startPos = seqEndPos + 1
        -- Increment length
        len = len + 1
    until seqEndPos >= endPos
    return len
end
161
-- Returns the byte index at which the n'th UTF-8 sequence starts, counting
-- from startPos (the sequence starting at startPos itself counts as the
-- first), forwards when n is positive and backwards when n is negative.
-- Returns nil if the string does not hold that many sequences.
-- startPos defaults to 1 when n is non-negative and to the last byte of the
-- string when n is negative. If n is zero, this function instead returns the
-- byte index of the nearest position at or before startPos where a sequence
-- can be decoded (nil if there is none).
-- Raises "initial position is not beginning of a valid sequence" when an
-- explicitly given startPos is not the start of a sequence and n is not
-- zero, and an out-of-range error (via strRelToAbs) for a bad startPos.
function utf8.offset(str, n, startPos)
    -- Remember whether the caller chose the start or we are defaulting it.
    local startDefaulted = startPos == nil
    startPos = strRelToAbs(str, startPos or (n >= 0 and 1) or #str)
    -- Find the beginning of the sequence over startPos
    if n == 0 then
        for i = startPos, 1, -1 do
            if decode(str, i) then
                return i
            end
        end
        return nil
    end
    -- The default start for a backwards search is the last byte, which is a
    -- continuation byte whenever the string ends in a multi-byte sequence.
    -- That is not a caller mistake, so skip the check and let the backwards
    -- scan below walk to the start of that last sequence.
    local skipStartCheck = startDefaulted and n < 0
    if not skipStartCheck and not decode(str, startPos) then
        error("initial position is not beginning of a valid sequence",
            2)
    end
    local itEnd, itStep = #str, 1 -- Forwards: n'th sequence towards the end
    if n < 0 then -- Backwards: n'th sequence towards the beginning
        n = -n
        itEnd = 1
        itStep = -1
    end
    for i = startPos, itEnd, itStep do
        -- Only positions where a sequence starts are counted.
        if decode(str, i) then
            n = n - 1
            if n == 0 then
                return i
            end
        end
    end
    return nil
end
204
-- Forces a string to contain only valid UTF-8 data.
-- Walks `str` one sequence at a time: well-formed sequences are copied
-- through unchanged, while every byte that does not start a well-formed
-- sequence (including a sequence cut short by the end of the string) is
-- replaced with U+FFFD. Returns the repaired string; an empty string is
-- returned as is.
function utf8.force(str)
    -- U+FFFD REPLACEMENT CHARACTER, pre-encoded as UTF-8 (EF BF BD).
    local replacement = "\239\191\189"
    local buf = {}
    local curPos, endPos = 1, #str
    -- A pre-tested loop, so an empty string never reaches decode (which
    -- rejects every index into an empty string).
    while curPos <= endPos do
        local seqStartPos, seqEndPos = decode(str, curPos)
        if not seqStartPos or seqEndPos > endPos then
            buf[#buf + 1] = replacement
            -- Skip just the offending byte and try again at the next one.
            curPos = curPos + 1
        else
            buf[#buf + 1] = str:sub(seqStartPos, seqEndPos)
            curPos = seqEndPos + 1
        end
    end
    return table.concat(buf, "")
end
222
223---
224return utf8