Vendor UTF-8

https://github.com/meepen/Lua-5.1-UTF-8
author: Case Duckworth 2024-03-01 22:53:48 -0600
committer: Case Duckworth 2024-03-01 22:53:48 -0600
commit: cdb8a9c957374f691e2cccba384bf05dd843b197 (patch)
tree: 91ed736d1e7e0a9170078a79038844122ab3b5d3
parent: Credit Norvig (diff)
download: lam-cdb8a9c957374f691e2cccba384bf05dd843b197.tar.gz
lam-cdb8a9c957374f691e2cccba384bf05dd843b197.zip
1 files changed, 224 insertions, 0 deletions
diff --git a/utf8.lua b/utf8.lua
new file mode 100644
index 0000000..2bfcf8a
--- /dev/null
+++ b/utf8.lua

@@ -0,0 +1,224 @@
+--- lam.utf8 --- I did not write this code.  I did lightly edit it to work with
+-- this project.  This module was written by GitHub user meepen and is under the
+-- CC0 1.0 license.  The source can be found here:
+-- https://github.com/meepen/Lua-5.1-UTF-8/
+-- Module table; it is returned at the end of the file.
+local utf8 = {}
+-- Local aliases for the globals used below.  `bit` is not created here: it
+-- has to exist already as a global offering band, bor, lshift and rshift.
+local bit, error, ipairs = bit, error, ipairs
+local string, table = string, table
+-- table.unpack where the table library provides it, the global unpack if not
+local unpack = table.unpack or unpack
+-- Pattern for the string library that matches one UTF-8 byte-sequence in a
+-- string that is already valid UTF-8: a NUL (%z), an ASCII byte or a lead
+-- byte in 0xC2-0xF4, followed by any run of continuation bytes (0x80-0xBF).
+utf8.charpattern = "[%z\x01-\x7F\xC2-\xF4][\x80-\xBF]*"
+-- Converts string positions to absolute, 1-based indexes.  A position that
+-- is not positive becomes #str + position + 1, so -1 names the last byte,
+-- like the string library's functions (0 therefore always ends up out of
+-- range).  Raises an error for any position that does not land inside the
+-- string; otherwise returns the converted positions in argument order.
+local function strRelToAbs(str, ...)
+        local positions = { ... }
+        for slot, pos in ipairs(positions) do
+                if not (pos > 0) then
+                        pos = #str + pos + 1
+                end
+                if pos < 1 or pos > #str then
+                        -- Level 3 puts the blame on whoever called the
+                        -- function that asked for the conversion.
+                        error("bad index to string (out of range)", 3)
+                end
+                positions[slot] = pos
+        end
+        return unpack(positions)
+end
+-- Decodes a single UTF-8 byte-sequence from a string, ensuring it is valid.
+-- Returns the index of the first and last byte of the sequence, or nil when
+-- the bytes at startPos are not well formed: a stray continuation byte, a
+-- lead byte outside 0xC2-0xF4, a sequence cut short by the end of the string,
+-- a bad continuation byte, an overlong three- or four-byte form, or a value
+-- above U+10FFFF.  Surrogate code points are not rejected.  startPos defaults
+-- to 1 and may be negative; an out-of-range startPos raises an error.
+local function decode(str, startPos)
+        startPos = strRelToAbs(str, startPos or 1)
+        local b1 = str:byte(startPos)
+        -- Single-byte sequence
+        if b1 < 0x80 then
+                return startPos, startPos
+        end
+        -- Validate first byte of multi-byte sequence.  0x80-0xBF are
+        -- continuation bytes, 0xC0/0xC1 could only start overlong forms and
+        -- 0xF5+ would encode values above U+10FFFF.
+        if b1 < 0xC2 or b1 > 0xF4 then
+                return nil
+        end
+        -- Number of continuation bytes announced by the first byte
+        local contByteCount = (b1 >= 0xF0 and 3) or (b1 >= 0xE0 and 2) or 1
+        local endPos = startPos + contByteCount
+        -- A sequence that runs past the end of the string is truncated.
+        -- str:byte() would silently return fewer bytes, so check explicitly.
+        if endPos > #str then
+                return nil
+        end
+        -- The byte right after the lead byte has a narrower range for a few
+        -- lead bytes: E0 and F0 must not start overlong forms, and F4 must
+        -- stay at or below U+10FFFF.
+        local lo, hi = 0x80, 0xBF
+        if b1 == 0xE0 then
+                lo = 0xA0
+        elseif b1 == 0xF0 then
+                lo = 0x90
+        elseif b1 == 0xF4 then
+                hi = 0x8F
+        end
+        for i = startPos + 1, endPos do
+                local bX = str:byte(i)
+                if bX < lo or bX > hi then
+                        return nil
+                end
+                -- Every later continuation byte uses the full range
+                lo, hi = 0x80, 0xBF
+        end
+        return startPos, endPos
+end
+-- Encodes one code point (already range-checked by the caller) as a UTF-8
+-- byte string of one to four bytes.
+local function encodeCodepoint(cp)
+        if cp < 0x80 then -- Single-byte sequence
+                return string.char(cp)
+        end
+        -- The last byte always carries the low six bits
+        local last = bit.bor(0x80, bit.band(cp, 0x3F))
+        if cp < 0x800 then -- Two-byte sequence
+                return string.char(
+                        bit.bor(0xC0, bit.band(bit.rshift(cp, 6), 0x1F)),
+                        last)
+        end
+        -- Second-to-last byte: bits 6 to 11
+        local penult = bit.bor(0x80, bit.band(bit.rshift(cp, 6), 0x3F))
+        if cp < 0x10000 then -- Three-byte sequence
+                return string.char(
+                        bit.bor(0xE0, bit.band(bit.rshift(cp, 12), 0x0F)),
+                        penult, last)
+        end
+        -- Four-byte sequence
+        return string.char(
+                bit.bor(0xF0, bit.band(bit.rshift(cp, 18), 0x07)),
+                bit.bor(0x80, bit.band(bit.rshift(cp, 12), 0x3F)),
+                penult, last)
+end
+-- Builds a string from zero or more integer code points, appending the UTF-8
+-- encoding of each one in order.  A value below 0 or above 0x10FFFF raises
+-- "bad argument #k to char (out of range)", blamed on the caller.
+function utf8.char(...)
+        local pieces = {}
+        for idx, cp in ipairs { ... } do
+                if cp < 0 or cp > 0x10FFFF then
+                        error("bad argument #" .. idx ..
+                               " to char (out of range)", 2)
+                end
+                pieces[#pieces + 1] = encodeCodepoint(cp)
+        end
+        return table.concat(pieces)
+end
+-- Returns an iterator over the UTF-8 sequences of str, for use in a generic
+-- for loop.  Each step yields the byte index where a sequence starts and the
+-- sequence itself as a string; the iterator returns nil once the index has
+-- moved past the end of str, and raises "invalid UTF-8 code" when the bytes
+-- at the current index do not decode.
+function utf8.codes(str)
+        local cursor = 1
+        local function nextSequence()
+                if cursor <= #str then
+                        local first, last = decode(str, cursor)
+                        if not first then
+                                error("invalid UTF-8 code", 2)
+                        end
+                        -- Resume right after this sequence next time
+                        cursor = last + 1
+                        return first, str:sub(first, last)
+                end
+                return nil
+        end
+        return nextSequence
+end
+-- Returns the integer code point of every UTF-8 sequence that starts between
+-- byte positions startPos and endPos (both included).  startPos defaults to
+-- 1, endPos defaults to startPos; negative positions count back from the end
+-- of the string and out-of-range positions raise an error.  Returns nothing
+-- when endPos lies before startPos.  Raises "invalid UTF-8 code" when a
+-- sequence in the range does not decode.
+function utf8.codepoint(str, startPos, endPos)
+        startPos, endPos = strRelToAbs(str,
+                                       startPos or 1,
+                                       endPos or startPos or 1)
+        local ret = {}
+        -- Test the range before decoding so that an empty range
+        -- (endPos < startPos) yields no values instead of one.
+        while startPos <= endPos do
+                local seqStartPos, seqEndPos = decode(str, startPos)
+                if not seqStartPos then
+                        error("invalid UTF-8 code", 2)
+                end
+                -- Amount of bytes making up our sequence
+                local len = seqEndPos - seqStartPos + 1
+                local cp = str:byte(seqStartPos)
+                if len > 1 then
+                        -- Keep only the payload bits of the lead byte:
+                        -- 5 bits for two bytes, 4 for three, 3 for four.
+                        cp = bit.band(cp, bit.rshift(0x7F, len))
+                        -- Each continuation byte adds six more bits
+                        for i = seqStartPos + 1, seqEndPos do
+                                cp = bit.bor(bit.lshift(cp, 6),
+                                             bit.band(str:byte(i), 0x3F))
+                        end
+                end
+                ret[#ret + 1] = cp
+                -- Continue with the sequence that follows this one
+                startPos = seqEndPos + 1
+        end
+        return unpack(ret)
+end
+-- Returns the number of UTF-8 sequences that start between byte positions
+-- startPos and endPos (both included).  false, index is returned if an
+-- invalid sequence is hit.  startPos defaults to 1, endPos defaults to -1.
+-- An empty string with both positions left out has length 0, and so does a
+-- range whose endPos lies before startPos.
+function utf8.len(str, startPos, endPos)
+        -- No index is valid inside an empty string, so strRelToAbs would
+        -- reject the defaults; answer directly instead.
+        if #str == 0 and not startPos and not endPos then
+                return 0
+        end
+        startPos, endPos = strRelToAbs(str, startPos or 1, endPos or -1)
+        local len = 0
+        -- Test first so that an empty range counts nothing
+        while startPos <= endPos do
+                local seqStartPos, seqEndPos = decode(str, startPos)
+                -- Hit an invalid sequence?
+                if not seqStartPos then
+                        return false, startPos
+                end
+                -- Move on to the next sequence and count this one
+                startPos = seqEndPos + 1
+                len = len + 1
+        end
+        return len
+end
+-- Returns the byte-index of the n'th UTF-8-character counting from the given
+-- byte-index (nil if none); the character at startPos is number one in
+-- either direction.  startPos defaults to 1 when n is positive and to the
+-- last byte of the string when n is negative.  If n is zero, this function
+-- instead returns the byte-index of the UTF-8-character startPos lies within.
+function utf8.offset(str, n, startPos)
+        -- Note whether the caller chose the start before it is defaulted
+        local defaulted = not startPos
+        startPos = strRelToAbs(str, startPos or (n >= 0 and 1) or #str)
+        -- Find the beginning of the sequence over startPos
+        if n == 0 then
+                for i = startPos, 1, -1 do
+                        local seqStartPos = decode(str, i)
+                        if seqStartPos then
+                                return seqStartPos
+                        end
+                end
+                return nil
+        end
+        local backwards = n < 0
+        -- An explicit startPos has to sit on the first byte of a sequence.
+        -- The default start of a backwards search is the last byte, which is
+        -- a continuation byte whenever the string ends in a multi-byte
+        -- character; the scan below walks back to the lead byte, so the
+        -- check is skipped for that case.
+        if not (backwards and defaulted) and not decode(str, startPos) then
+                error("initial position is not beginning of a valid sequence",
+                      2)
+        end
+        -- Scan towards the end of the string, or towards its start
+        local itEnd, itStep = #str, 1
+        if backwards then
+                n = -n
+                itEnd, itStep = 1, -1
+        end
+        for i = startPos, itEnd, itStep do
+                local seqStartPos = decode(str, i)
+                if seqStartPos then
+                        n = n - 1
+                        if n == 0 then
+                                return seqStartPos
+                        end
+                end
+        end
+        return nil
+end
+-- Forces a string to contain only valid UTF-8 data.  Valid sequences are
+-- copied unchanged; every byte that does not start one is replaced with
+-- U+FFFD.  An empty string comes back as an empty string.
+function utf8.force(str)
+        local buf = {}
+        local curPos, endPos = 1, #str
+        -- Test first: decode() raises on any index into an empty string, so
+        -- it must not run when there is nothing to scan.
+        while curPos <= endPos do
+                local seqStartPos, seqEndPos = decode(str, curPos)
+                if not seqStartPos then
+                        -- Go through the module table: no bare `char` is
+                        -- in scope in this file.
+                        buf[#buf + 1] = utf8.char(0xFFFD)
+                        curPos = curPos + 1
+                else
+                        buf[#buf + 1] = str:sub(seqStartPos, seqEndPos)
+                        curPos = seqEndPos + 1
+                end
+        end
+        return table.concat(buf)
+end
+---
+return utf8
author	Case Duckworth	2024-03-01 22:53:48 -0600
committer	Case Duckworth	2024-03-01 22:53:48 -0600
commit	cdb8a9c957374f691e2cccba384bf05dd843b197 (patch)
tree	91ed736d1e7e0a9170078a79038844122ab3b5d3
parent	Credit Norvig (diff)
download	lam-cdb8a9c957374f691e2cccba384bf05dd843b197.tar.gz lam-cdb8a9c957374f691e2cccba384bf05dd843b197.zip

diff --git a/utf8.lua b/utf8.lua new file mode 100644 index 0000000..2bfcf8a --- /dev/null +++ b/utf8.lua
@@ -0,0 +1,224 @@
	1	--- lam.utf8 --- I did not write this code. I did lightly edit it to work with
	2	-- this project. This module was written by GitHub user meepen and is under the
	3	-- CC0 1.0 license. The source can be found here:
	4	-- https://github.com/meepen/Lua-5.1-UTF-8/
	5	-- Module table; it is returned at the end of the file.
	6	local utf8 = {}
	7	-- Local aliases for the globals used below.  `bit` is not created here: it
	8	-- has to exist already as a global offering band, bor, lshift and rshift.
	9	local bit, error, ipairs = bit, error, ipairs
	10	local string, table = string, table
	11	-- table.unpack where the table library provides it, the global unpack if not
	12	local unpack = table.unpack or unpack
	13
	14	-- Pattern for the string library that matches one UTF-8 byte-sequence in a
	15	-- string that is already valid UTF-8: a NUL (%z), an ASCII byte or a lead
	16	-- byte in 0xC2-0xF4, followed by any run of continuation bytes (0x80-0xBF).
	17	utf8.charpattern = "[%z\x01-\x7F\xC2-\xF4][\x80-\xBF]*"
	18
	19	-- Converts string positions to absolute, 1-based indexes.  A position that
	20	-- is not positive becomes #str + position + 1, so -1 names the last byte,
	21	-- like the string library's functions (0 therefore always ends up out of
	22	-- range).  Raises an error for any position that does not land inside the
	23	-- string; otherwise returns the converted positions in argument order.
	24	local function strRelToAbs(str, ...)
	25	        local positions = { ... }
	26	        for slot, pos in ipairs(positions) do
	27	                if not (pos > 0) then
	28	                        pos = #str + pos + 1
	29	                end
	30	                if pos < 1 or pos > #str then
	31	                        -- Level 3 puts the blame on whoever called the
	32	                        -- function that asked for the conversion.
	33	                        error("bad index to string (out of range)", 3)
	34	                end
	35	                positions[slot] = pos
	36	        end
	37	        return unpack(positions)
	38	end
	39
	40	-- Decodes a single UTF-8 byte-sequence from a string, ensuring it is valid.
	41	-- Returns the index of the first and last byte of the sequence, or nil when
	42	-- the bytes at startPos are not well formed: a stray continuation byte, a
	43	-- lead byte outside 0xC2-0xF4, a sequence cut short by the end of the string,
	44	-- a bad continuation byte, an overlong three- or four-byte form, or a value
	45	-- above U+10FFFF.  Surrogate code points are not rejected.  startPos defaults
	46	-- to 1 and may be negative; an out-of-range startPos raises an error.
	47	local function decode(str, startPos)
	48	        startPos = strRelToAbs(str, startPos or 1)
	49	        local b1 = str:byte(startPos)
	50	        -- Single-byte sequence
	51	        if b1 < 0x80 then
	52	                return startPos, startPos
	53	        end
	54	        -- Validate first byte of multi-byte sequence.  0x80-0xBF are
	55	        -- continuation bytes, 0xC0/0xC1 could only start overlong forms and
	56	        -- 0xF5+ would encode values above U+10FFFF.
	57	        if b1 < 0xC2 or b1 > 0xF4 then
	58	                return nil
	59	        end
	60	        -- Number of continuation bytes announced by the first byte
	61	        local contByteCount = (b1 >= 0xF0 and 3) or (b1 >= 0xE0 and 2) or 1
	62	        local endPos = startPos + contByteCount
	63	        -- A sequence that runs past the end of the string is truncated.
	64	        -- str:byte() would silently return fewer bytes, so check explicitly.
	65	        if endPos > #str then
	66	                return nil
	67	        end
	68	        -- The byte right after the lead byte has a narrower range for a few
	69	        -- lead bytes: E0 and F0 must not start overlong forms, and F4 must
	70	        -- stay at or below U+10FFFF.
	71	        local lo, hi = 0x80, 0xBF
	72	        if b1 == 0xE0 then
	73	                lo = 0xA0
	74	        elseif b1 == 0xF0 then
	75	                lo = 0x90
	76	        elseif b1 == 0xF4 then
	77	                hi = 0x8F
	78	        end
	79	        for i = startPos + 1, endPos do
	80	                local bX = str:byte(i)
	81	                if bX < lo or bX > hi then
	82	                        return nil
	83	                end
	84	                -- Every later continuation byte uses the full range
	85	                lo, hi = 0x80, 0xBF
	86	        end
	87	        return startPos, endPos
	88	end
	89
	90	-- Encodes one code point (already range-checked by the caller) as a UTF-8
	91	-- byte string of one to four bytes.
	92	local function encodeCodepoint(cp)
	93	        if cp < 0x80 then -- Single-byte sequence
	94	                return string.char(cp)
	95	        end
	96	        -- The last byte always carries the low six bits
	97	        local last = bit.bor(0x80, bit.band(cp, 0x3F))
	98	        if cp < 0x800 then -- Two-byte sequence
	99	                return string.char(
	100	                        bit.bor(0xC0, bit.band(bit.rshift(cp, 6), 0x1F)),
	101	                        last)
	102	        end
	103	        -- Second-to-last byte: bits 6 to 11
	104	        local penult = bit.bor(0x80, bit.band(bit.rshift(cp, 6), 0x3F))
	105	        if cp < 0x10000 then -- Three-byte sequence
	106	                return string.char(
	107	                        bit.bor(0xE0, bit.band(bit.rshift(cp, 12), 0x0F)),
	108	                        penult, last)
	109	        end
	110	        -- Four-byte sequence
	111	        return string.char(
	112	                bit.bor(0xF0, bit.band(bit.rshift(cp, 18), 0x07)),
	113	                bit.bor(0x80, bit.band(bit.rshift(cp, 12), 0x3F)),
	114	                penult, last)
	115	end
	116	-- Builds a string from zero or more integer code points, appending the UTF-8
	117	-- encoding of each one in order.  A value below 0 or above 0x10FFFF raises
	118	-- "bad argument #k to char (out of range)", blamed on the caller.
	119	function utf8.char(...)
	120	        local pieces = {}
	121	        for idx, cp in ipairs { ... } do
	122	                if cp < 0 or cp > 0x10FFFF then
	123	                        error("bad argument #" .. idx ..
	124	                               " to char (out of range)", 2)
	125	                end
	126	                pieces[#pieces + 1] = encodeCodepoint(cp)
	127	        end
	128	        return table.concat(pieces)
	129	end
	130
	131	-- Returns an iterator over the UTF-8 sequences of str, for use in a generic
	132	-- for loop.  Each step yields the byte index where a sequence starts and the
	133	-- sequence itself as a string; the iterator returns nil once the index has
	134	-- moved past the end of str, and raises "invalid UTF-8 code" when the bytes
	135	-- at the current index do not decode.
	136	function utf8.codes(str)
	137	        local cursor = 1
	138	        local function nextSequence()
	139	                if cursor <= #str then
	140	                        local first, last = decode(str, cursor)
	141	                        if not first then
	142	                                error("invalid UTF-8 code", 2)
	143	                        end
	144	                        -- Resume right after this sequence next time
	145	                        cursor = last + 1
	146	                        return first, str:sub(first, last)
	147	                end
	148	                return nil
	149	        end
	150	        return nextSequence
	151	end
	152
	153	-- Returns the integer code point of every UTF-8 sequence that starts between
	154	-- byte positions startPos and endPos (both included).  startPos defaults to
	155	-- 1, endPos defaults to startPos; negative positions count back from the end
	156	-- of the string and out-of-range positions raise an error.  Returns nothing
	157	-- when endPos lies before startPos.  Raises "invalid UTF-8 code" when a
	158	-- sequence in the range does not decode.
	159	function utf8.codepoint(str, startPos, endPos)
	160	        startPos, endPos = strRelToAbs(str,
	161	                                       startPos or 1,
	162	                                       endPos or startPos or 1)
	163	        local ret = {}
	164	        -- Test the range before decoding so that an empty range
	165	        -- (endPos < startPos) yields no values instead of one.
	166	        while startPos <= endPos do
	167	                local seqStartPos, seqEndPos = decode(str, startPos)
	168	                if not seqStartPos then
	169	                        error("invalid UTF-8 code", 2)
	170	                end
	171	                -- Amount of bytes making up our sequence
	172	                local len = seqEndPos - seqStartPos + 1
	173	                local cp = str:byte(seqStartPos)
	174	                if len > 1 then
	175	                        -- Keep only the payload bits of the lead byte:
	176	                        -- 5 bits for two bytes, 4 for three, 3 for four.
	177	                        cp = bit.band(cp, bit.rshift(0x7F, len))
	178	                        -- Each continuation byte adds six more bits
	179	                        for i = seqStartPos + 1, seqEndPos do
	180	                                cp = bit.bor(bit.lshift(cp, 6),
	181	                                             bit.band(str:byte(i), 0x3F))
	182	                        end
	183	                end
	184	                ret[#ret + 1] = cp
	185	                -- Continue with the sequence that follows this one
	186	                startPos = seqEndPos + 1
	187	        end
	188	        return unpack(ret)
	189	end
	190
	191	-- Returns the number of UTF-8 sequences that start between byte positions
	192	-- startPos and endPos (both included).  false, index is returned if an
	193	-- invalid sequence is hit.  startPos defaults to 1, endPos defaults to -1.
	194	-- An empty string with both positions left out has length 0, and so does a
	195	-- range whose endPos lies before startPos.
	196	function utf8.len(str, startPos, endPos)
	197	        -- No index is valid inside an empty string, so strRelToAbs would
	198	        -- reject the defaults; answer directly instead.
	199	        if #str == 0 and not startPos and not endPos then
	200	                return 0
	201	        end
	202	        startPos, endPos = strRelToAbs(str, startPos or 1, endPos or -1)
	203	        local len = 0
	204	        -- Test first so that an empty range counts nothing
	205	        while startPos <= endPos do
	206	                local seqStartPos, seqEndPos = decode(str, startPos)
	207	                -- Hit an invalid sequence?
	208	                if not seqStartPos then
	209	                        return false, startPos
	210	                end
	211	                -- Move on to the next sequence and count this one
	212	                startPos = seqEndPos + 1
	213	                len = len + 1
	214	        end
	215	        return len
	216	end
	217
	218	-- Returns the byte-index of the n'th UTF-8-character counting from the given
	219	-- byte-index (nil if none); the character at startPos is number one in
	220	-- either direction.  startPos defaults to 1 when n is positive and to the
	221	-- last byte of the string when n is negative.  If n is zero, this function
	222	-- instead returns the byte-index of the UTF-8-character startPos lies within.
	223	function utf8.offset(str, n, startPos)
	224	        -- Note whether the caller chose the start before it is defaulted
	225	        local defaulted = not startPos
	226	        startPos = strRelToAbs(str, startPos or (n >= 0 and 1) or #str)
	227	        -- Find the beginning of the sequence over startPos
	228	        if n == 0 then
	229	                for i = startPos, 1, -1 do
	230	                        local seqStartPos = decode(str, i)
	231	                        if seqStartPos then
	232	                                return seqStartPos
	233	                        end
	234	                end
	235	                return nil
	236	        end
	237	        local backwards = n < 0
	238	        -- An explicit startPos has to sit on the first byte of a sequence.
	239	        -- The default start of a backwards search is the last byte, which is
	240	        -- a continuation byte whenever the string ends in a multi-byte
	241	        -- character; the scan below walks back to the lead byte, so the
	242	        -- check is skipped for that case.
	243	        if not (backwards and defaulted) and not decode(str, startPos) then
	244	                error("initial position is not beginning of a valid sequence",
	245	                      2)
	246	        end
	247	        -- Scan towards the end of the string, or towards its start
	248	        local itEnd, itStep = #str, 1
	249	        if backwards then
	250	                n = -n
	251	                itEnd, itStep = 1, -1
	252	        end
	253	        for i = startPos, itEnd, itStep do
	254	                local seqStartPos = decode(str, i)
	255	                if seqStartPos then
	256	                        n = n - 1
	257	                        if n == 0 then
	258	                                return seqStartPos
	259	                        end
	260	                end
	261	        end
	262	        return nil
	263	end
	264
	265	-- Forces a string to contain only valid UTF-8 data.  Valid sequences are
	266	-- copied unchanged; every byte that does not start one is replaced with
	267	-- U+FFFD.  An empty string comes back as an empty string.
	268	function utf8.force(str)
	269	        local buf = {}
	270	        local curPos, endPos = 1, #str
	271	        -- Test first: decode() raises on any index into an empty string, so
	272	        -- it must not run when there is nothing to scan.
	273	        while curPos <= endPos do
	274	                local seqStartPos, seqEndPos = decode(str, curPos)
	275	                if not seqStartPos then
	276	                        -- Go through the module table: no bare `char` is
	277	                        -- in scope in this file.
	278	                        buf[#buf + 1] = utf8.char(0xFFFD)
	279	                        curPos = curPos + 1
	280	                else
	281	                        buf[#buf + 1] = str:sub(seqStartPos, seqEndPos)
	282	                        curPos = seqEndPos + 1
	283	                end
	284	        end
	285	        return table.concat(buf)
	286	end
	287
	288	---
	289	return utf8