--- lam.read
-- Reader for a small Scheme: tokenize a program string, then parse the
-- token stream into nested Lua tables (lists), numbers, and strings.
-- Known gaps (per upstream commit message): no symbol type yet, and no
-- line-continuation (newline) escapes inside strings.

local read = {}

local utf8 = require "utf8"           -- Lua 5.3+ standard utf8 library
table.unpack = table.unpack or unpack -- Lua 5.1 compatibility shim

--- Pop and return the first element of a sequence (queue order).
-- Replaces the `util.pop` this commit's code still called after removing
-- `require "util"`; parse/tokenize consume input left-to-right, so the
-- front element is the one to remove.
local function pop (t)
  return table.remove(t, 1)
end

--- Explode a UTF-8 string into a sequence of one-character strings.
-- utf8.codes yields integer codepoints, so each is converted back with
-- utf8.char: downstream code compares characters against string literals
-- and pattern-matches them with `:match`.
local string_to_table =
  function (str)
    local chars = {}
    for _, cp in utf8.codes(str) do
      table.insert(chars, utf8.char(cp))
    end
    return chars
  end

-- Backslash escape table for string literals. A value is either the
-- replacement character itself, or a function(chars) -> replacement, chars
-- that consumes further input.
local bslash = {
  a = "\a",
  b = "\b",
  t = "\t",
  n = "\n",
  r = "\r",
  ["\""] = "\"",
  ["\\"] = "\\",
  ["|"] = "|",

  -- TODO: whitespace escapes ( \ <intraline ws>* <newline> <ws>* => nothing )

  -- \xHH...; : the character with the given hex codepoint.
  x = function (chars)
    local hex = {}
    repeat
      local c = pop(chars)
      table.insert(hex, c)
    until c == ";"
    table.remove(hex) -- drop the terminating semicolon
    return utf8.char(tonumber(table.concat(hex), 16)), chars
  end,
}

--- Consume a string literal from `chars` (opening quote already eaten).
-- Applies the escapes in `bslash`; an unknown escape is kept verbatim,
-- backslash included. Returns the string value and the remaining chars.
local consume_string =
  function (chars)
    local parts = {}
    repeat
      local c = pop(chars)
      if c == "\\" then
        c = pop(chars)
        local esc = bslash[c]
        if type(esc) == "function" then
          local ch
          ch, chars = esc(chars)
          table.insert(parts, ch)
        elseif esc then
          table.insert(parts, esc)
        else
          table.insert(parts, "\\" .. c) -- unknown escape: keep verbatim
        end
      elseif c == "\"" then
        break -- closing quote ends the literal
      else
        table.insert(parts, c)
      end
    until #chars == 0
    return table.concat(parts), chars
  end

--- Convert a program string into a list of { type = ..., value = ... }
-- tokens. Token types: begin_list, end_list, string, number, symbol.
-- Returns nil for a missing or empty program.
read.tokenize =
  function (program)
    if not program or program == "" then return nil end
    local tokens = {}
    local token = ""
    local token_type = nil

    -- Flush the accumulated token (or an explicitly supplied one).
    -- Parameter names avoid shadowing the builtin `type`.
    local push_token =
      function (ttype, tval)
        ttype = ttype or token_type
        token = tval or token
        if token:len() > 0 then
          table.insert(tokens, { type = ttype, value = token })
          token = ""
          token_type = nil
        end
      end

    local chars = string_to_table(program)
    while #chars > 0 do
      local c = pop(chars)
      if c == "(" then
        push_token()
        push_token("begin_list", "(")
      elseif c == ")" then
        push_token()
        push_token("end_list", ")")
      elseif c:match("%s") then -- whitespace ends the current token
        push_token()
      elseif c == "\"" then     -- string literal
        local str
        str, chars = consume_string(chars)
        push_token("string", str)
      elseif c:match("%d") then -- a digit starts/extends a number
        token = token .. c
        token_type = token_type or "number"
      else                      -- anything else is (part of) a symbol
        token = token .. c
        token_type = token_type or "symbol"
      end
    end
    push_token()
    return tokens
  end

-- Token-to-Lua-value converters, keyed by token type.
read.tokentable = {
  string =
    function (tok)
      return tok.value
    end,
  number =
    function (tok)
      return tonumber(tok.value)
    end,
  symbol =
    function (tok) -- TODO: should return a Symbol from types...
      return tok.value
    end,
}

--- Parse a token list (as produced by read.tokenize) into a value:
-- a nested Lua table for a list, or an atom via read.tokentable.
-- Consumes tokens destructively from the front.
-- Dispatches on tok.type, not tok.value, so a *string* token whose
-- value happens to be "(" or ")" is never mistaken for a delimiter.
read.parse =
  function (tokens)
    assert(tokens and next(tokens), "Unexpected EOF")
    local tok = pop(tokens)
    if tok.type == "begin_list" then
      local list = {}
      while true do
        assert(tokens[1], "Unexpected EOF") -- unterminated list
        if tokens[1].type == "end_list" then break end
        table.insert(list, read.parse(tokens))
      end
      pop(tokens) -- remove ")"
      return list
    elseif tok.type == "end_list" then
      error("Unexpected ')'")
    elseif read.tokentable[tok.type] then
      return read.tokentable[tok.type](tok)
    else
      error("Bad token: '" .. tok.value .. "'")
    end
  end

--- Read a Scheme expression from a string.
read.read = function (program) return read.parse(read.tokenize(program)) end

-- The module table is itself callable: read(program) == read.read(program).
return setmetatable(read, {
  __call =
    function (_, program)
      return read.read(program)
    end,
})