Rewrite read

Now works with strings and numbers and handles \x...; escapes. Does not yet do symbol conversion or newline escapes.
author: Case Duckworth 2024-03-01 22:50:34 -0600
committer: Case Duckworth 2024-03-01 22:50:34 -0600
commit: f7d58cf08bbfe319f7df45156d0d3d2e07624edc (patch)
tree: f25653bb96be0c5976ce563c21d1e9257459e7a8
parent: Change types to lowercase; add todos (diff)
download: lam-f7d58cf08bbfe319f7df45156d0d3d2e07624edc.tar.gz
lam-f7d58cf08bbfe319f7df45156d0d3d2e07624edc.zip
1 files changed, 133 insertions, 48 deletions
diff --git a/read.lua b/read.lua
index 43b272a..c89261c 100644
--- a/read.lua
+++ b/read.lua

@@ -1,73 +1,158 @@
 --- lam.read
 local read = {}
+local utf8 = require "utf8"
+table.unpack = table.unpack or unpack
-local util = require "util"
+local string_to_table =
+        function(str)
-function read.tokenize (str)
+                local tbl = {}
-        --[[ Convert a string of characters into a list of tokens ]]
+                for p, c in utf8.codes(str) do
-        assert(str, "No program given")
+                        table.insert(tbl, c)
-        local tbl = {}
-        local word = ""
-        local push_word =
-                function ()
-                        if word:len() > 0 then
-                                table.insert(tbl, word)
-                                word = ""
-                        end
                end
+                return tbl
+        end
-        for c = 1, #str do
+local bslash = { -- backslash characters
-                char = string.char(str:byte(c))
+        a = "\a",
-                if char == " " or char == "\t" or char == "\n" then
+        b = "\b",
-                        push_word()
+        t = "\t",
-                elseif char == "(" then
+        n = "\n",
-                        push_word()
+        r = "\r",
-                        table.insert(tbl, "(")
+        ["\""] = "\"",
-                elseif char == ")" then
+        ["\\"] = "\\",
-                        push_word()
+        ["|"] = "|",
-                        table.insert(tbl, ")")
-                else
+        -- TODO: whitespace
-                        word = word .. char
+        -- \<intraline whitespace>*<line ending> <intraline whitespace>* :
-                end
+        -- nothing
+        x = -- \x<hex scalar value>; : specified character
+                function (chars)
+                        local u8ch = {}
+                        repeat
+                                local c = util.pop(chars)
+                                table.insert(u8ch,c)
+                        until c == ";"
+                        table.remove(u8ch) -- remove semicolon
+                        return
+                                utf8.char(tonumber(table.concat(u8ch), 16)),
+                                chars
+                end,
+}
+local consume_string =
+        function(chars)
+                local str = {}
+                repeat
+                        local c = util.pop(chars)
+                        if c == "\\" then
+                                c = util.pop(chars)
+                                if bslash[c] then
+                                        if type(bslash[c]) == "function" then
+                                                c, chars = bslash[c](chars)
+                                                table.insert(str, c)
+                                        else
+                                                table.insert(str, bslash[c])
+                                        end
+                                else
+                                        table.insert(str, "\\"..c)
+                                end
+                        elseif c == "\"" then
+                                break
+                        else
+                                table.insert(str, c)
+                        end
+                until #chars == 0
+                return table.concat(str), chars
        end
-        push_word()
-        return tbl
-end
-function read.read (str)
+read.tokenize =
-        -- [[ Read a scheme expression from a string ]]
+        function (program)
+                if not program or program == "" then return nil end
+                local tokens = {}
+                local token = ""
+                local token_type = nil
+                local push_token =
+                        function (type, tok)
+                                type = type or token_type
+                                token = tok or token
+                                if token:len() > 0 then
+                                        table.insert(tokens, {
+                                                        type = type,
+                                                        value = token, })
+                                        token = ""
+                                        token_type = nil
+                                end
+                        end
-        local function Atom (token)
+                local chars = string_to_table(program)
-                local n = tonumber(token)
+                while #chars > 0 do
-                if n then return n
+                        local c = util.pop(chars)
-                else return tostring(token)
+                        if c == "(" then
+                                push_token()
+                                push_token("begin_list", "(")
+                        elseif c == ")" then
+                                push_token()
+                                push_token("end_list", ")")
+                        elseif c:match("%s") then -- whitespace
+                                push_token()
+                        elseif c == "\"" then             -- string
+                                str, chars = consume_string(chars)
+                                push_token("string", str)
+                        elseif c:match("%d") then -- numbers
+                                token = token .. c
+                                token_type = token_type or "number"
+                        else
+                                token = token .. c
+                                token_type = token_type or "symbol"
+                        end
                end
+                push_token()
+                return tokens
        end
-        local function read_tokens (tokens)
+read.tokentable = {
-                --[[ Read a list of tokens from `tokenize' ]]
+        string =
+                function (tok)
+                        return tok.value
+                end,
+        number =
+                function (tok)
+                        return tonumber(tok.value)
+                end,
+        symbol =
+                function (tok)  -- TODO need to return a Symbol from types...
+                        return tok.value
+                end,
+}
+read.parse =
+        function (tokens)
                assert(next(tokens), "Unexpected EOF")
-                token = util.pop(tokens)
+                tok = util.pop(tokens)
-                if token == "(" then
+                if tok.value == "(" then
                        local L = {}
-                        while tokens[1] ~= ")" do
+                        while tokens[1].value ~= ")" do
-                                table.insert(L, read_tokens(tokens))
+                                table.insert(L, read.parse(tokens))
                        end
-                        util.pop(tokens)                       --  remove ")"
+                        util.pop(tokens) -- remove ")"
                        return L
-                elseif token == ")" then
+                elseif tok.value == ")" then
                        error("Unexpected ')'")
+                elseif read.tokentable[tok.type] then
+                        return read.tokentable[tok.type](tok)
                else
-                        return Atom(token)
+                        error("Bad token: '" .. tok.value .. "'")
                end
        end
-        return read_tokens(read.tokenize(str))
+read.read = function (program) return read.parse(read.tokenize(program)) end
-end
+---
 return setmetatable(read, { __call =
-                                    function(_, str)
+                                    function(_, program)
-                                            return read.read(str)
+                                            return read.read(program)
                                    end,
 })
author	Case Duckworth	2024-03-01 22:50:34 -0600
committer	Case Duckworth	2024-03-01 22:50:34 -0600
commit	f7d58cf08bbfe319f7df45156d0d3d2e07624edc (patch)
tree	f25653bb96be0c5976ce563c21d1e9257459e7a8
parent	Change types to lowercase; add todos (diff)
download	lam-f7d58cf08bbfe319f7df45156d0d3d2e07624edc.tar.gz lam-f7d58cf08bbfe319f7df45156d0d3d2e07624edc.zip

diff --git a/read.lua b/read.lua index 43b272a..c89261c 100644 --- a/read.lua +++ b/read.lua
@@ -1,73 +1,158 @@
1	--- lam.read	1	--- lam.read
2		2
3	local read = {}	3	local read = {}
		4	local utf8 = require "utf8"
		5	table.unpack = table.unpack or unpack
4		6
5	local util = require "util"	7	local string_to_table =
6		8	function(str)
7	function read.tokenize (str)	9	local tbl = {}
8	--[[ Convert a string of characters into a list of tokens ]]	10	for p, c in utf8.codes(str) do
9	assert(str, "No program given")	11	table.insert(tbl, c)
10	local tbl = {}
11	local word = ""
12	local push_word =
13	function ()
14	if word:len() > 0 then
15	table.insert(tbl, word)
16	word = ""
17	end
18	end	12	end
		13	return tbl
		14	end
19		15
20	for c = 1, #str do	16	local bslash = { -- backslash characters
21	char = string.char(str:byte(c))	17	a = "\a",
22	if char == " " or char == "\t" or char == "\n" then	18	b = "\b",
23	push_word()	19	t = "\t",
24	elseif char == "(" then	20	n = "\n",
25	push_word()	21	r = "\r",
26	table.insert(tbl, "(")	22	["\""] = "\"",
27	elseif char == ")" then	23	["\\"] = "\\",
28	push_word()	24	["\|"] = "\|",
29	table.insert(tbl, ")")	25
30	else	26	-- TODO: whitespace
31	word = word .. char	27	-- \<intraline whitespace>*<line ending> <intraline whitespace>* :
32	end	28	-- nothing
		29
		30	x = -- \x<hex scalar value>; : specified character
		31	function (chars)
		32	local u8ch = {}
		33	repeat
		34	local c = util.pop(chars)
		35	table.insert(u8ch,c)
		36	until c == ";"
		37	table.remove(u8ch) -- remove semicolon
		38	return
		39	utf8.char(tonumber(table.concat(u8ch), 16)),
		40	chars
		41	end,
		42	}
		43
		44	local consume_string =
		45	function(chars)
		46	local str = {}
		47	repeat
		48	local c = util.pop(chars)
		49	if c == "\\" then
		50	c = util.pop(chars)
		51	if bslash[c] then
		52	if type(bslash[c]) == "function" then
		53	c, chars = bslash[c](chars)
		54	table.insert(str, c)
		55	else
		56	table.insert(str, bslash[c])
		57	end
		58	else
		59	table.insert(str, "\\"..c)
		60	end
		61	elseif c == "\"" then
		62	break
		63	else
		64	table.insert(str, c)
		65	end
		66	until #chars == 0
		67	return table.concat(str), chars
33	end	68	end
34	push_word()
35	return tbl
36	end
37		69
38	function read.read (str)	70	read.tokenize =
39	-- [[ Read a scheme expression from a string ]]	71	function (program)
		72	if not program or program == "" then return nil end
		73	local tokens = {}
		74	local token = ""
		75	local token_type = nil
		76
		77	local push_token =
		78	function (type, tok)
		79	type = type or token_type
		80	token = tok or token
		81	if token:len() > 0 then
		82	table.insert(tokens, {
		83	type = type,
		84	value = token, })
		85	token = ""
		86	token_type = nil
		87	end
		88	end
40		89
41	local function Atom (token)	90	local chars = string_to_table(program)
42	local n = tonumber(token)	91	while #chars > 0 do
43	if n then return n	92	local c = util.pop(chars)
44	else return tostring(token)	93	if c == "(" then
		94	push_token()
		95	push_token("begin_list", "(")
		96	elseif c == ")" then
		97	push_token()
		98	push_token("end_list", ")")
		99	elseif c:match("%s") then -- whitespace
		100	push_token()
		101	elseif c == "\"" then -- string
		102	str, chars = consume_string(chars)
		103	push_token("string", str)
		104	elseif c:match("%d") then -- numbers
		105	token = token .. c
		106	token_type = token_type or "number"
		107	else
		108	token = token .. c
		109	token_type = token_type or "symbol"
		110	end
45	end	111	end
		112	push_token()
		113	return tokens
46	end	114	end
47		115
48	local function read_tokens (tokens)	116	read.tokentable = {
49	--[[ Read a list of tokens from `tokenize' ]]	117	string =
		118	function (tok)
		119	return tok.value
		120	end,
		121	number =
		122	function (tok)
		123	return tonumber(tok.value)
		124	end,
		125	symbol =
		126	function (tok) -- TODO need to return a Symbol from types...
		127	return tok.value
		128	end,
		129	}
		130
		131	read.parse =
		132	function (tokens)
50	assert(next(tokens), "Unexpected EOF")	133	assert(next(tokens), "Unexpected EOF")
51	token = util.pop(tokens)	134	tok = util.pop(tokens)
52	if token == "(" then	135	if tok.value == "(" then
53	local L = {}	136	local L = {}
54	while tokens[1] ~= ")" do	137	while tokens[1].value ~= ")" do
55	table.insert(L, read_tokens(tokens))	138	table.insert(L, read.parse(tokens))
56	end	139	end
57	util.pop(tokens) -- remove ")"	140	util.pop(tokens) -- remove ")"
58	return L	141	return L
59	elseif token == ")" then	142	elseif tok.value == ")" then
60	error("Unexpected ')'")	143	error("Unexpected ')'")
		144	elseif read.tokentable[tok.type] then
		145	return read.tokentable[tok.type](tok)
61	else	146	else
62	return Atom(token)	147	error("Bad token: '" .. tok.value .. "'")
63	end	148	end
64	end	149	end
65		150
66	return read_tokens(read.tokenize(str))	151	read.read = function (program) return read.parse(read.tokenize(program)) end
67	end
68		152
		153	---
69	return setmetatable(read, { __call =	154	return setmetatable(read, { __call =
70	function(_, str)	155	function(_, program)
71	return read.read(str)	156	return read.read(program)
72	end,	157	end,
73	})	158	})