diff options
-rw-r--r-- | utf8.lua | 224 |
1 files changed, 224 insertions, 0 deletions
diff --git a/utf8.lua b/utf8.lua new file mode 100644 index 0000000..2bfcf8a --- /dev/null +++ b/utf8.lua | |||
@@ -0,0 +1,224 @@ | |||
1 | --- lam.utf8 --- I did not write this code. I did lightly edit it to work with | ||
2 | -- this project. This module was written by GitHub user meepen and is under the | ||
3 | -- CC0 1.0 license. The source can be found here: | ||
4 | -- https://github.com/meepen/Lua-5.1-UTF-8/ | ||
5 | local utf8 = {} | ||
6 | local bit = bit | ||
7 | local error = error | ||
8 | local ipairs = ipairs | ||
9 | local string = string | ||
10 | local table = table | ||
11 | local unpack = table.unpack or unpack | ||
12 | |||
-- Pattern that can be used with the string library to match a single UTF-8
-- byte-sequence. This expects the string to contain valid UTF-8 data.
-- The first class matches one lead byte: NUL (written as %z), 0x01-0x7F, or
-- 0xC2-0xF4. The second class matches zero or more continuation bytes
-- (0x80-0xBF). The pattern does not check that the number of continuation
-- bytes agrees with the lead byte.
utf8.charpattern = "[%z\x01-\x7F\xC2-\xF4][\x80-\xBF]*"
16 | |||
-- Converts string positions into absolute (positive) positions.
-- Zero and negative positions are resolved relative to the end of the string,
-- the way the string library treats negative indices (-1 is the last byte).
-- Takes the string followed by any number of positions and returns the
-- converted positions in the same order.
-- Raises "bad index to string (out of range)" (error level 3) when a resolved
-- position falls outside 1..#str.
local function strRelToAbs(str, ...)
    local positions = { ... }
    for slot, pos in ipairs(positions) do
        if not (pos > 0) then
            -- Count backwards from the end of the string
            pos = #str + pos + 1
        end
        if pos < 1 or pos > #str then
            error("bad index to string (out of range)", 3)
        end
        positions[slot] = pos
    end
    return unpack(positions)
end
30 | |||
-- Decodes a single UTF-8 byte-sequence from a string, ensuring it is valid.
-- str:      string to inspect
-- startPos: byte index of the first byte of the sequence (defaults to 1;
--           negative values count from the end of the string)
-- Returns the index of the first and last byte of the sequence, or nil when
-- the bytes at startPos do not form a well-formed sequence. A sequence is
-- rejected when:
--   * the lead byte is a continuation byte or outside 0xC2-0xF4,
--   * the string ends before all continuation bytes are present,
--   * a continuation byte is outside 0x80-0xBF,
--   * the encoding is overlong or encodes a value above U+10FFFF.
-- Surrogate code points (U+D800-U+DFFF) are still accepted so that everything
-- utf8.char can produce decodes again.
-- Raises (through strRelToAbs) when startPos is out of range.
local function decode(str, startPos)
    startPos = strRelToAbs(str, startPos or 1)
    local b1 = str:byte(startPos)
    -- Single-byte sequence
    if b1 < 0x80 then
        return startPos, startPos
    end
    -- Validate first byte of multi-byte sequence
    if b1 < 0xC2 or b1 > 0xF4 then
        return nil
    end
    -- Amount of continuation bytes announced by the lead byte
    local contByteCount = (b1 >= 0xF0 and 3) or (b1 >= 0xE0 and 2) or 1
    local endPos = startPos + contByteCount
    -- A sequence cut off by the end of the string is not valid
    if endPos > #str then
        return nil
    end
    -- Every continuation byte must look like 10xxxxxx
    for i = startPos + 1, endPos do
        local bX = str:byte(i)
        if bX < 0x80 or bX > 0xBF then
            return nil
        end
    end
    -- The second byte has a narrower range for some lead bytes: this rules
    -- out overlong encodings and values above U+10FFFF
    local b2 = str:byte(startPos + 1)
    if (b1 == 0xE0 and b2 < 0xA0) or
       (b1 == 0xF0 and b2 < 0x90) or
       (b1 == 0xF4 and b2 > 0x8F) then
        return nil
    end
    return startPos, endPos
end
57 | |||
-- Encodes zero or more integer code points as UTF-8 and returns the
-- concatenation of the resulting byte-sequences.
-- Arguments are read with ipairs, so processing stops at the first nil.
-- Raises "bad argument #k to char (out of range)" (error level 2) when the
-- k'th value is below 0 or above 0x10FFFF.
function utf8.char(...)
    local pieces = {}
    for argIndex, cp in ipairs { ... } do
        if cp < 0 or cp > 0x10FFFF then
            error("bad argument #" .. argIndex ..
                  " to char (out of range)", 2)
        end
        local encoded
        if cp < 0x80 then
            -- One byte: 0xxxxxxx
            encoded = string.char(cp)
        elseif cp < 0x800 then
            -- Two bytes: 110xxxxx 10xxxxxx
            local lead = bit.bor(0xC0, bit.band(bit.rshift(cp, 6), 0x1F))
            local tail = bit.bor(0x80, bit.band(cp, 0x3F))
            encoded = string.char(lead, tail)
        elseif cp < 0x10000 then
            -- Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
            local lead = bit.bor(0xE0, bit.band(bit.rshift(cp, 12), 0x0F))
            local mid = bit.bor(0x80, bit.band(bit.rshift(cp, 6), 0x3F))
            local tail = bit.bor(0x80, bit.band(cp, 0x3F))
            encoded = string.char(lead, mid, tail)
        else
            -- Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            local lead = bit.bor(0xF0, bit.band(bit.rshift(cp, 18), 0x07))
            local hi = bit.bor(0x80, bit.band(bit.rshift(cp, 12), 0x3F))
            local lo = bit.bor(0x80, bit.band(bit.rshift(cp, 6), 0x3F))
            local tail = bit.bor(0x80, bit.band(cp, 0x3F))
            encoded = string.char(lead, hi, lo, tail)
        end
        pieces[#pieces + 1] = encoded
    end
    return table.concat(pieces)
end
89 | |||
-- Returns an iterator function that walks a string one UTF-8 sequence at a
-- time, meant for use in a generic for loop.
-- Each step yields the byte index at which a sequence starts and the sequence
-- itself as a string; the iterator yields nil once the end of the string has
-- been passed.
-- Raises "invalid UTF-8 code" (error level 2) when a sequence cannot be
-- decoded.
function utf8.codes(str)
    local cursor = 1
    local function step()
        if cursor <= #str then
            local first, last = decode(str, cursor)
            if not first then
                error("invalid UTF-8 code", 2)
            end
            -- Continue after this sequence on the next call
            cursor = last + 1
            return first, str:sub(first, last)
        end
        -- Nothing left to iterate
        return nil
    end
    return step
end
107 | |||
-- Returns the integer code points of the UTF-8 sequence(s) that start between
-- two byte positions of a string (both inclusive).
-- str:      string to read from
-- startPos: byte index of the first sequence (defaults to 1)
-- endPos:   byte index up to which sequences may start (defaults to startPos)
-- Negative positions count from the end of the string.
-- Returns one integer per sequence, or nothing when startPos lies after
-- endPos.
-- Raises "invalid UTF-8 code" (error level 2) when an invalid or truncated
-- sequence is hit, and raises (through strRelToAbs) when a position is out of
-- range.
function utf8.codepoint(str, startPos, endPos)
    startPos, endPos = strRelToAbs(str,
                                   startPos or 1,
                                   endPos or startPos or 1)
    local ret = {}
    -- A while loop (rather than repeat/until) so an empty range yields nothing
    while startPos <= endPos do
        local seqStartPos, seqEndPos = decode(str, startPos)
        -- Also reject sequences that run past the end of the string, so the
        -- byte reads below never see nil
        if not seqStartPos or seqEndPos > #str then
            error("invalid UTF-8 code", 2)
        end
        -- Amount of bytes making up our sequence
        local len = seqEndPos - seqStartPos + 1
        local cp = str:byte(seqStartPos)
        if len > 1 then
            -- Strip the length marker from the lead byte; the payload mask is
            -- 0x1F, 0x0F or 0x07 for 2, 3 or 4 byte sequences
            cp = bit.band(cp, bit.rshift(0x7F, len))
            -- Append the 6 payload bits of every continuation byte
            for i = seqStartPos + 1, seqEndPos do
                cp = bit.bor(bit.lshift(cp, 6),
                             bit.band(str:byte(i), 0x3F))
            end
        end
        ret[#ret + 1] = cp
        -- Move on to the next sequence
        startPos = seqEndPos + 1
    end
    return unpack(ret)
end
142 | |||
-- Returns the number of UTF-8 sequences that start between two byte positions
-- of a string (both inclusive).
-- str:      string to measure
-- startPos: byte index to start counting at (defaults to 1)
-- endPos:   byte index up to which sequences may start (defaults to -1, the
--           last byte)
-- Negative positions count from the end of the string.
-- Returns the count, or false plus the byte index of the offending sequence
-- when an invalid or truncated sequence is hit. An empty string measured with
-- the default bounds, and a range whose start lies after its end, both give 0.
-- Raises (through strRelToAbs) when a position is out of range.
function utf8.len(str, startPos, endPos)
    -- The empty string has no valid byte index at all, so answer directly
    -- instead of letting the index conversion raise
    if #str == 0
       and (startPos == nil or startPos == 1)
       and (endPos == nil or endPos == -1 or endPos == 0) then
        return 0
    end
    startPos, endPos = strRelToAbs(str, startPos or 1, endPos or -1)
    local len = 0
    -- A while loop (rather than repeat/until) so an empty range counts as 0
    while startPos <= endPos do
        local seqStartPos, seqEndPos = decode(str, startPos)
        -- Hit an invalid sequence, or one cut off by the end of the string?
        if not seqStartPos or seqEndPos > #str then
            return false, startPos
        end
        -- Increment current string pointer
        startPos = seqEndPos + 1
        -- Increment length
        len = len + 1
    end
    return len
end
161 | |||
-- Returns the byte-index of the n'th UTF-8-character counted from the given
-- byte-index (nil if none). The character at startPos itself counts as the
-- first one. startPos defaults to 1 when n is non-negative and to the last
-- byte of the string when n is negative. If n is zero, this function instead
-- returns the byte-index of the UTF-8-character startPos lies within.
-- str:      string to search
-- n:        which character to find; negative values search backwards
-- startPos: byte index to start from; negative values count from the end
-- Raises "initial position is not beginning of a valid sequence" (error
-- level 2) when n is non-zero and an explicitly given startPos (or the
-- default start for positive n) is not the first byte of a valid sequence.
-- Raises (through strRelToAbs) when startPos is out of range.
function utf8.offset(str, n, startPos)
    -- Remember whether the caller chose the start themselves
    local explicitStart = not not startPos
    startPos = strRelToAbs(str, startPos or (n >= 0 and 1) or #str)
    -- Find the beginning of the sequence over startPos
    if n == 0 then
        for i = startPos, 1, -1 do
            if decode(str, i) then
                return i
            end
        end
        return nil
    end
    -- The default start for a backwards search is the last byte, which is a
    -- continuation byte whenever the string ends in a multi-byte character.
    -- That is a legitimate place to start scanning from, so only validate
    -- the start in the other cases.
    if (explicitStart or n > 0) and not decode(str, startPos) then
        error("initial position is not beginning of a valid sequence",
              2)
    end
    local remaining, itEnd, itStep
    if n > 0 then -- Find the beginning of the n'th sequence forwards
        remaining, itEnd, itStep = n, #str, 1
    else -- Find the beginning of the n'th sequence backwards
        remaining, itEnd, itStep = -n, 1, -1
    end
    for i = startPos, itEnd, itStep do
        -- Only bytes that start a valid sequence are counted
        if decode(str, i) then
            remaining = remaining - 1
            if remaining == 0 then
                return i
            end
        end
    end
    return nil
end
204 | |||
-- Forces a string to contain only valid UTF-8 data.
-- Valid sequences are copied unchanged; every byte that does not start a
-- valid sequence (including the lead byte of a sequence cut off by the end of
-- the string) is replaced with U+FFFD.
-- str: string to sanitise
-- Returns the sanitised string; the empty string is returned unchanged.
function utf8.force(str)
    local buf = {}
    local curPos, endPos = 1, #str
    -- Encode the replacement character once instead of per invalid byte
    local replacement = utf8.char(0xFFFD)
    -- A while loop (rather than repeat/until) so the empty string is never
    -- handed to decode, which would raise on position 1
    while curPos <= endPos do
        local seqStartPos, seqEndPos = decode(str, curPos)
        if not seqStartPos or seqEndPos > endPos then
            buf[#buf + 1] = replacement
            -- Skip only the offending byte and retry from the next one
            curPos = curPos + 1
        else
            buf[#buf + 1] = str:sub(seqStartPos, seqEndPos)
            curPos = seqEndPos + 1
        end
    end
    return table.concat(buf)
end
222 | |||
223 | --- | ||
224 | return utf8 | ||