From ff222d305ecc5625a68e5b61d6f03f6201676dd4 Mon Sep 17 00:00:00 2001
From: Case Duckworth
Date: Tue, 9 Apr 2024 22:48:12 -0500
Subject: Implement strings

Move the reader's inline readtable handlers into named consume_* helper
functions and add a string reader (consume_string) that understands
backslash escapes, including \x<hex>; and escaped line endings.
type.string now also accepts a table of one-character strings.

Every consume_* helper takes the list of remaining characters and returns
the token, its type, and the rest of the characters.

---
 read.lua | 244 ++++++++++++++++++++++++++++++++++++++++++---------------------
 type.lua |  11 ++-
 2 files changed, 174 insertions(+), 81 deletions(-)

diff --git a/read.lua b/read.lua
index 6d55e23..069df57 100644
--- a/read.lua
+++ b/read.lua
@@ -20,94 +20,180 @@ end
 
 ---[[ READ TABLE ]]---
 
--- each function should take a list of characters and return the token, its
--- type, and the rest of the characters
-m.readtable = {}
-m.readtable.chars = {
-	["("] = function (cs) return pop(cs), "open", cs end,
-	[")"] = function (cs) return pop(cs), "close", cs end,
-	["'"] = function (cs) return pop(cs), "quote", cs end,
-	["`"] = function (cs) return pop(cs), "quote", cs end,
-	[","] = -- unquote
-		function (cs)
-			pop(cs) -- remove ','
-			if cs[1] == "@" then
-				pop(cs) -- remove '@'
-				return ",@", "quote", cs
-			else
-				return ",", "quote", cs
-			end
-		end,
-	[";"] = -- comment
-		function (cs)
-			local comment = {}
-			while #cs > 0 and not cs[1]:match("\n") do
-				table.insert(comment, pop(cs))
-			end
-			return table.concat(comment), "comment", cs
-		end,
-	["#"] = -- literal
-		function (cs)
-			local tok
-			-- bail on just '#\'
-			if not (cs[2] and cs[3]) then
-				cs = {}
-				error("bad literal", "#\\")
-			end
+--- helper functions
+
+local function consume_unquote (cs)
+	pop(cs) -- remove ','
+	if cs[1] == "@" then
+		pop(cs) -- remove '@'
+		return ",@", "quote", cs
+	else
+		return ",", "quote", cs
+	end
+end
+
+local function consume_comment (cs)
+	local comment = {}
+	while #cs > 0 and not cs[1]:match("\n") do
+		table.insert(comment, pop(cs))
+	end
+	return table.concat(comment), "comment", cs
+end
 
-			-- read '#\ ' and such correctly
-			if cs[2] == "\\" and cs[3]:match(token_separators) then
-				pop(cs) -- remove '\'
-				pop(cs) -- remove next character
-				return type.character(cs[1])
+local function consume_literal (cs)
+	local tok
+	-- bail on just '#\'
+	if not (cs[2] and cs[3]) then
+		cs = {}
+		error("bad literal", "#\\")
+	end
+
+	-- read '#\ ' and such correctly: the character is itself a token separator
+	if cs[2] == "\\" and cs[3]:match(token_separators) then
+		pop(cs) -- remove '#'
+		pop(cs) -- remove '\'
+		return type.character(pop(cs)), "literal", cs -- consume the character too
+	end
+
+	pop(cs) -- discard '#' ...
+	tok, cs = consume_token(cs)
+	tok = "#" .. tok -- ... then put it back
+
+	local val
+	if m.readtable.literals.lit[tok] then
+		val = m.readtable.literals.lit[tok]
+	else
+		for re, fn in pairs(m.readtable.literals.regex)
+		do
+			if tok:match(re) then
+				val = fn(tok)
 			end
+		end
+	end
 
-			pop(cs) -- discard '#' ...
-			tok, cs = consume_token(cs)
-			tok = "#" .. tok -- ... then put it back
+	if val == nil then
+		error("bad literal", tok)
+	end
+	return val, "literal", cs
+end
 
-			local val
-			if m.readtable.literals.lit[tok] then
-				val = m.readtable.literals.lit[tok]
-			else
-				for re, fn in pairs(m.readtable.literals.regex)
-				do
-					if tok:match(re) then
-						val = fn(tok)
-					end
+local function consume_whitespace (cs)
+	while #cs > 0 and cs[1]:match("%s") do
+		pop(cs)
+	end
+	return false, nil, cs
+end
+
+local function consume_numbers_etc (cs)
+	-- Since numbers can start with +, -, and ., those symbols and ... are
+	-- handled along with numbers.
+	local tok
+	tok, cs = consume_token(cs)
+	if tok:match("^[-+]$") or tok == "..." then
+		return tok, "symbol", cs
+	elseif tok == "." then
+		return tok, "dot", cs
+	else -- number
+		local n = tonumber(tok)
+		if not n then
+			error("bad number", tok) -- report the offending token; n is nil here
+		end
+		return n, "number", cs
+	end
+end
+
+-- strings
+
+local function consume_string_whitespace (cs)
+	-- \<intraline whitespace>*<line ending><intraline whitespace>* : nothing
+	local s = {"\\"}
+	while #cs > 0 and cs[1]:match("[ \t]") do -- guard: input may end here
+		table.insert(s, pop(cs))
+	end
+	if cs[1] ~= "\n" then -- not a line continuation: keep the text verbatim
+		table.insert(s, cs[1])
+		return table.concat(s), cs
+	end
+	while #cs > 0 and cs[1]:match("%s") do -- skip the newline and leading blanks
+		pop(cs)
+	end
+	return cs[1], cs
+end
+
+local function consume_string_hexvalue (cs)
+	-- \x<hex>; : specified character.  called with 'x' still at cs[1]
+	pop(cs) -- discard 'x' so it is not parsed as a hex digit
+	local u8ch = {}
+	while #cs > 0 and cs[1] ~= ";" do -- stop at ';' or at end of input
+		table.insert(u8ch, pop(cs))
+	end
+	-- ';' is left at cs[1]: consume_string pops one character after an escape
+	return utf8.char(tonumber(table.concat(u8ch), 16)), cs
+end
+
+local function consume_string (cs)
+	local str = {}
+	local escapes = {
+		a = "\a",
+		b = "\b",
+		t = "\t",
+		n = "\n",
+		r = "\r",
+		["\""] = "\"",
+		["\\"] = "\\",
+		["|"] = "|",
+		[" "] = consume_string_whitespace,
+		["\t"] = consume_string_whitespace,
+		["\n"] = consume_string_whitespace,
+		x = consume_string_hexvalue,
+	}
+	pop(cs) -- discard '"'
+	repeat
+		local c = pop(cs)
+		if c == "\\" then
+			c = cs[1] -- peek at the escape character; it is popped below
+			if escapes[c] then
+				if type.luatype(escapes[c]) == "function" then
+					c, cs = escapes[c](cs)
+					table.insert(str, c)
+				else
+					table.insert(str, escapes[c])
 				end
+			else
+				table.insert(str, "\\"..c)
 			end
+			pop(cs) -- drop the last character of the escape sequence
+		elseif c == "\"" then
+			break
+		else
+			table.insert(str, c)
+		end
+	until #cs == 0
+	return type.string(str), "string", cs
+end
 
-			if val == nil then
-				error("bad literal", tok)
-			end
-			return val, "literal", cs
-		end,
+local function consume_char_as (token_type)
+	-- return a function that pops a character and returns it with
+	-- TOKEN_TYPE
+	return function (cs) return pop(cs), token_type, cs end
+end
+
+-- each function should take a list of characters and return the token, its
+-- type, and the rest of the characters
+m.readtable = {}
+m.readtable.chars = {
+	["("] = consume_char_as("open"),
+	[")"] = consume_char_as("close"),
+	["'"] = consume_char_as("quote"),
+	["`"] = consume_char_as("quote"),
+	[","] = consume_unquote,
+	["\""] = consume_string,
+	[";"] = consume_comment,
+	["#"] = consume_literal,
 }
 m.readtable.regex = {
-	["%s"] = -- whitespace
-		function (cs)
-			while #cs > 0 and cs[1]:match("%s") do
-				pop(cs)
-			end
-			return false, nil, cs
-		end,
-	["[%d.+-]"] = -- numbers and symbols +, -, ., and ...
-		function (cs)
-			local tok
-			tok, cs = consume_token(cs)
-			if tok:match("^[-+]$") or tok == "..." then
-				return tok, "symbol", cs
-			elseif tok == "." then
-				return tok, "dot", cs
-			else -- number
-				local n = tonumber(tok)
-				if not n then
-					error("bad number", n)
-				end
-				return n, "number", cs
-			end
-		end,
+	["%s"] = consume_whitespace,
+	["[%d.+-]"] = consume_numbers_etc, -- numbers and symbols +, -, ., and ...
 }
 m.readtable.default = -- default action if nothing else matches
 	function (cs)
diff --git a/type.lua b/type.lua
index c205468..0000bfb 100644
--- a/type.lua
+++ b/type.lua
@@ -207,9 +207,16 @@ function m.list (items, final)
 	return tolist(final or m.null, items)
 end
 
--- strings are vectors of chars
+-- strings are vectors of chars. not lam characters, but one-character strings.
+-- this is for utf8 ease-of-use... TODO i still need to write functions to pluck
+-- out a single lam character from a string, etc.
 function m.string (x)
-	local t = tochars(tostring(x))
+	local t
+	if m.luatype(x) == "table" then
+		t = x
+	else
+		t = tochars(tostring(x))
+	end
 	local mt = {
 		__type = "string",
 		__tostring =
-- 
cgit 1.4.1-21-gabe81