Files
shed/src/parse/lex.rs

529 lines
12 KiB
Rust

use std::{fmt::Display, ops::{Bound, Deref, Range, RangeBounds}};
use bitflags::bitflags;
use crate::{builtin::BUILTINS, prelude::*};
/// Reserved words of the shell grammar.
///
/// `LexStream::read_string` marks a word found in command position with
/// `TkFlags::KEYWORD` when it appears in this list.
pub const KEYWORDS: [&str; 14] = [
    // conditionals
    "if", "then", "elif", "else", "fi",
    // loops
    "while", "until", "select", "for", "in", "do", "done",
    // case statements
    "case", "esac",
];
/// The subset of [`KEYWORDS`] that begins a block construct.
///
/// Not referenced in the visible part of this file; presumably consulted when
/// setting `TkFlags::OPENER` elsewhere.
pub const OPENERS: [&str; 6] = ["if", "while", "until", "for", "select", "case"];
/// A byte range paired with the source text that the range indexes into.
#[derive(Clone, PartialEq, Default, Debug)]
pub struct Span<'s> {
    /// Byte offsets into `source`.
    range: Range<usize>,
    /// The complete text the range refers to (not just the covered part).
    source: &'s str,
}

impl<'s> Span<'s> {
    /// New `Span`. Wraps a range and a string slice that it refers to.
    pub fn new(range: Range<usize>, source: &'s str) -> Self {
        Span { range, source }
    }

    /// Slice the source string at the wrapped range.
    ///
    /// The result borrows from the source text (`'s`) rather than from this
    /// `Span`, so it stays usable after the span itself has been dropped.
    ///
    /// # Panics
    /// Panics if the range is out of bounds for the source or does not fall
    /// on UTF-8 character boundaries.
    pub fn as_str(&self) -> &'s str {
        &self.source[self.range.start..self.range.end]
    }

    /// The whole source text this span points into.
    ///
    /// Takes a plain `&self`: the returned reference carries the source
    /// lifetime `'s`, so the span does not have to stay borrowed for it.
    pub fn get_source(&self) -> &'s str {
        self.source
    }

    /// A copy of the wrapped byte range.
    pub fn range(&self) -> Range<usize> {
        self.range.clone()
    }
}

/// Allows simple access to the underlying range wrapped by the span
/// (e.g. `span.start`, `span.end`).
impl<'s> Deref for Span<'s> {
    type Target = Range<usize>;

    fn deref(&self) -> &Self::Target {
        &self.range
    }
}
/// Classification of a token.
///
/// `Null` is the default value.
#[derive(Clone, PartialEq, Debug, Default)]
pub enum TkRule {
    /// Placeholder class of a default-constructed token.
    #[default]
    Null,
    /// Start-of-Input
    SOI,
    /// A word, as read by `LexStream::read_string`.
    Str,
    /// `|`
    Pipe,
    /// `|&`
    ErrPipe,
    /// `&&`
    And,
    /// `||`
    Or,
    /// `&`
    Bg,
    /// A run of separators that starts with `\r`, `\n` or `;`.
    Sep,
    /// A redirection operator, as read by `LexStream::read_redir`.
    Redir,
    /// A token carrying a list of words; rendered as the words joined by spaces.
    Expanded { exp: Vec<String> },
    /// `#` up to and including the end of the line.
    Comment,
    /// End-of-Input
    EOI,
}
/// Kind of lexing error attached to a token.
///
/// `Null` (the default) means "no error". In the visible part of this file
/// only `UntermQuote` and `BadRedir` are ever assigned.
#[derive(Clone, Copy, PartialEq, Debug, Default)]
pub enum TkErr {
    /// No error.
    #[default]
    Null,
    /// A quotation was opened but never closed.
    UntermQuote,
    UntermSubsh,
    UntermEscape,
    UntermBrace,
    /// `>&` was not followed by a file-descriptor number.
    BadRedir,
    BadPipe,
    HangingDelim,
}
/// Lexer state marker with a single `Raw` variant.
///
/// NOTE(review): nothing in the visible part of this file refers to this
/// type — confirm whether it is still needed.
pub enum TkState {
Raw,
}
/// A single lexed token: its class, the span of source text it covers,
/// optional error information and a set of classification flags.
#[derive(Clone,Debug,PartialEq,Default)]
pub struct Tk<'s> {
/// What kind of token this is.
pub class: TkRule,
/// Span of the offending text. Filled in by `set_err`; `is_err` reports whether it is `Some`.
pub err_span: Option<Span<'s>>,
/// Error kind; `Tk::new` initialises it to `TkErr::Null`.
pub err: TkErr,
/// Range of source text this token covers.
pub span: Span<'s>,
/// Classification bits (see `TkFlags`); `Tk::new` starts with none set.
pub flags: TkFlags
}
// There's one impl here and then another in expand.rs which has the expansion logic
impl<'s> Tk<'s> {
    /// Build an error-free token of class `class` covering `span`, with no flags set.
    pub fn new(class: TkRule, span: Span<'s>) -> Self {
        Self {
            class,
            span,
            err: TkErr::Null,
            err_span: None,
            flags: TkFlags::empty(),
        }
    }

    /// Text of the token: the expanded words joined by single spaces for
    /// `TkRule::Expanded`, otherwise the spanned slice of the source.
    pub fn to_string(&self) -> String {
        if let TkRule::Expanded { exp } = &self.class {
            exp.join(" ")
        } else {
            self.span.as_str().to_string()
        }
    }

    /// Mark the token as erroneous: record the error kind and the span
    /// (`range` within `slice`) that the error refers to.
    pub fn set_err(&mut self, range: Range<usize>, slice: &'s str, err: TkErr) {
        self.err = err;
        self.err_span = Some(Span::new(range, slice));
    }

    /// Whether an error span has been recorded on this token.
    pub fn is_err(&self) -> bool {
        self.err_span.is_some()
    }

    /// The full source text the token's span points into.
    pub fn source(&self) -> &'s str {
        self.span.source
    }
}
/// Writes the expanded words joined by single spaces for `TkRule::Expanded`,
/// and the spanned slice of the source for every other class.
impl<'s> Display for Tk<'s> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        if let TkRule::Expanded { exp } = &self.class {
            write!(f, "{}", exp.join(" "))
        } else {
            write!(f, "{}", self.span.as_str())
        }
    }
}
bitflags! {
    /// Classification bits attached to a token.
    #[derive(Debug, Clone, Copy, PartialEq, Default)]
    pub struct TkFlags: u32 {
        /// The word is one of `KEYWORDS`.
        const KEYWORD = 0x01;
        /// This is a keyword that opens a new block statement, like 'if' and 'while'
        const OPENER  = 0x02;
        /// The word sits in command position and is neither a keyword nor an assignment.
        const IS_CMD  = 0x04;
        const IS_OP   = 0x08;
        /// The word sits in command position and contains an unescaped `=`.
        const ASSIGN  = 0x10;
        /// The command name is listed in `BUILTINS`.
        const BUILTIN = 0x20;
    }
}
pub struct LexStream<'t> {
source: &'t str,
pub cursor: usize,
in_quote: bool,
flags: LexFlags,
}
bitflags! {
pub struct LexFlags: u32 {
/// Return comment tokens
const LEX_COMMENTS = 0b00000001;
/// Allow unfinished input
const LEX_UNFINISHED = 0b00000010;
/// The next string-type token is a command name
const NEXT_IS_CMD = 0b00000100;
/// We are in a quotation, so quoting rules apply
const IN_QUOTE = 0b00001000;
/// Only lex strings; used in expansions
const RAW = 0b00010000;
/// The lexer has not produced any tokens yet
const FRESH = 0b00010000;
/// The lexer has no more tokens to produce
const STALE = 0b00100000;
}
}
impl<'t> LexStream<'t> {
pub fn new(source: &'t str, flags: LexFlags) -> Self {
flog!(TRACE, "new lex stream");
let flags = flags | LexFlags::FRESH | LexFlags::NEXT_IS_CMD;
Self { source, cursor: 0, in_quote: false, flags }
}
/// Returns a slice of the source input using the given range
/// Returns None if the range is out of the bounds of the string slice
///
/// Works with any kind of range
/// examples:
/// `LexStream.slice(1..10)`
/// `LexStream.slice(1..=10)`
/// `LexStream.slice(..10)`
/// `LexStream.slice(1..)`
///
pub fn slice<R: RangeBounds<usize>>(&self, range: R) -> Option<&'t str> {
// Sketchy downcast
let start = match range.start_bound() {
Bound::Included(&start) => start,
Bound::Excluded(&start) => start + 1,
Bound::Unbounded => 0
};
let end = match range.end_bound() {
Bound::Included(&end) => end,
Bound::Excluded(&end) => end + 1,
Bound::Unbounded => self.source.len()
};
self.source.get(start..end)
}
pub fn slice_from_cursor(&self) -> Option<&'t str> {
self.slice(self.cursor..)
}
/// The next string token is a command name
pub fn next_is_cmd(&mut self) {
self.flags |= LexFlags::NEXT_IS_CMD;
}
/// The next string token is not a command name
pub fn next_is_not_cmd(&mut self) {
self.flags &= !LexFlags::NEXT_IS_CMD;
}
pub fn read_redir(&mut self) -> Option<Tk<'t>> {
assert!(self.cursor <= self.source.len());
let slice = self.slice(self.cursor..)?;
let mut pos = self.cursor;
let mut chars = slice.chars().peekable();
let mut tk = Tk::default();
while let Some(ch) = chars.next() {
match ch {
'>' => {
pos += 1;
if let Some('>') = chars.peek() {
chars.next();
pos += 1;
}
if let Some('&') = chars.peek() {
chars.next();
pos += 1;
let mut found_fd = false;
while chars.peek().is_some_and(|ch| ch.is_ascii_digit()) {
chars.next();
found_fd = true;
pos += 1;
}
if !found_fd {
let err = TkErr::BadRedir;
tk = self.get_token(self.cursor..pos, TkRule::Redir);
tk.set_err(self.cursor..pos, self.source, err);
break
} else {
tk = self.get_token(self.cursor..pos, TkRule::Redir);
break
}
} else {
tk = self.get_token(self.cursor..pos, TkRule::Redir);
break
}
}
'<' => {
pos += 1;
for _ in 0..2 {
if let Some('<') = chars.peek() {
chars.next();
pos += 1;
} else {
break
}
}
tk = self.get_token(self.cursor..pos, TkRule::Redir);
break
}
'0'..='9' => {
pos += 1;
while chars.peek().is_some_and(|ch| ch.is_ascii_digit()) {
chars.next();
pos += 1;
}
}
_ => {
return None;
}
}
}
assert!(tk != Tk::default());
self.cursor = pos;
Some(tk)
}
pub fn read_string(&mut self) -> Tk<'t> {
assert!(self.cursor <= self.source.len());
let slice = self.slice_from_cursor().unwrap();
let mut pos = self.cursor;
let mut chars = slice.chars();
let mut quote_pos = None;
while let Some(ch) = chars.next() {
match ch {
'"' | '\'' => {
self.in_quote = true;
quote_pos = Some(pos);
pos += 1;
while let Some(q_ch) = chars.next() {
match q_ch {
'\\' => {
pos += 2;
chars.next();
}
_ if q_ch == ch => {
pos += 1;
self.in_quote = false;
break
}
// Any time an ambiguous character is found
// we must push the cursor by the length of the character
// instead of just assuming a length of 1.
// Allows spans to work for wide characters
_ => pos += q_ch.len_utf8()
}
}
}
_ if self.flags.contains(LexFlags::RAW) => {
if ch.is_whitespace() {
break;
} else {
pos += ch.len_utf8()
}
}
_ if !self.in_quote && is_op(ch) => break,
_ if is_hard_sep(ch) => break,
_ => pos += ch.len_utf8()
}
}
let mut new_tk = self.get_token(self.cursor..pos, TkRule::Str);
if self.in_quote && !self.flags.contains(LexFlags::LEX_UNFINISHED) {
new_tk.set_err(
quote_pos.unwrap()..pos,
self.source,
TkErr::UntermQuote
);
}
if self.flags.contains(LexFlags::NEXT_IS_CMD) {
if KEYWORDS.contains(&new_tk.span.as_str()) {
new_tk.flags |= TkFlags::KEYWORD;
} else if is_assignment(&new_tk.span.as_str()) {
new_tk.flags |= TkFlags::ASSIGN;
} else {
new_tk.flags |= TkFlags::IS_CMD;
flog!(TRACE, new_tk.span.as_str());
if BUILTINS.contains(&new_tk.span.as_str()) {
new_tk.flags |= TkFlags::BUILTIN;
}
flog!(TRACE, new_tk.flags);
self.next_is_not_cmd();
}
}
self.cursor = pos;
new_tk
}
pub fn get_token(&self, range: Range<usize>, class: TkRule) -> Tk<'t> {
let span = Span::new(range, self.source);
Tk::new(class, span)
}
}
impl<'t> Iterator for LexStream<'t> {
type Item = Tk<'t>;
fn next(&mut self) -> Option<Self::Item> {
assert!(self.cursor <= self.source.len());
// We are at the end of the input
if self.cursor == self.source.len() {
if self.flags.contains(LexFlags::STALE) {
// We've already returned an EOI token, nothing left to do
return None
} else {
// Return the EOI token
let token = self.get_token(self.cursor..self.cursor, TkRule::EOI);
self.flags |= LexFlags::STALE;
return Some(token)
}
}
// Return the SOI token
if self.flags.contains(LexFlags::FRESH) {
self.flags &= !LexFlags::FRESH;
let token = self.get_token(self.cursor..self.cursor, TkRule::SOI);
return Some(token)
}
// If we are just reading raw words, short circuit here
// Used for word splitting variable values
if self.flags.contains(LexFlags::RAW) {
return Some(self.read_string())
}
loop {
let pos = self.cursor;
if self.slice(pos..pos+2) == Some("\\\n") {
self.cursor += 2;
} else if pos < self.source.len() && is_field_sep(get_char(self.source, pos).unwrap()) {
self.cursor += 1;
} else {
break
}
}
if self.cursor == self.source.len() {
return None
}
let token = match get_char(self.source, self.cursor).unwrap() {
'\r' | '\n' | ';' => {
let ch_idx = self.cursor;
self.cursor += 1;
self.next_is_cmd();
while let Some(ch) = get_char(self.source, self.cursor) {
if is_hard_sep(ch) { // Combine consecutive separators into one, including whitespace
self.cursor += 1;
} else {
break
}
}
self.get_token(ch_idx..self.cursor, TkRule::Sep)
}
'#' => {
let ch_idx = self.cursor;
self.cursor += 1;
while let Some(ch) = get_char(self.source, self.cursor) {
self.cursor += 1;
if ch == '\n' {
break
}
}
self.get_token(ch_idx..self.cursor, TkRule::Comment)
}
'|' => {
let ch_idx = self.cursor;
self.cursor += 1;
self.next_is_cmd();
let tk_type = if let Some('|') = get_char(self.source, self.cursor) {
self.cursor += 1;
TkRule::Or
} else if let Some('&') = get_char(self.source, self.cursor) {
self.cursor += 1;
TkRule::ErrPipe
} else {
TkRule::Pipe
};
self.get_token(ch_idx..self.cursor, tk_type)
}
'&' => {
let ch_idx = self.cursor;
self.cursor += 1;
self.next_is_cmd();
let tk_type = if let Some('&') = get_char(self.source, self.cursor) {
self.cursor += 1;
TkRule::And
} else {
TkRule::Bg
};
self.get_token(ch_idx..self.cursor, tk_type)
}
_ => {
if let Some(tk) = self.read_redir() {
self.next_is_not_cmd();
tk
} else {
self.read_string()
}
}
};
Some(token)
}
}
/// The character that starts at byte offset `idx` of `src`.
///
/// Returns `None` when `idx` is at or past the end of the string, or when it
/// does not fall on a UTF-8 character boundary.
pub fn get_char(src: &str, idx: usize) -> Option<char> {
    src.get(idx..).and_then(|rest| rest.chars().next())
}
/// Whether `text` contains an `=` that is not preceded by a backslash escape.
///
/// A backslash consumes the character after it, so `a\=b` is not an
/// assignment while `a\\=b` is.
pub fn is_assignment(text: &str) -> bool {
    let mut escaped = false;
    for ch in text.chars() {
        if escaped {
            // This character was escaped by the preceding backslash.
            escaped = false;
            continue;
        }
        match ch {
            '\\' => escaped = true,
            '=' => return true,
            _ => {}
        }
    }
    false
}
/// True for the operator characters `|`, `&`, `>` and `<`.
pub fn is_op(ch: char) -> bool {
    ['|', '&', '>', '<'].contains(&ch)
}
/// True for a space, tab, newline or semicolon.
pub fn is_hard_sep(ch: char) -> bool {
    [' ', '\t', '\n', ';'].contains(&ch)
}
/// True for a space or a tab (blank characters, but not a newline).
pub fn is_field_sep(ch: char) -> bool {
    ch == ' ' || ch == '\t'
}