use std::{collections::VecDeque, fmt::Display, iter::Peekable, ops::{Bound, Deref, Range, RangeBounds}, str::Chars, sync::Arc}; use bitflags::bitflags; use crate::{builtin::BUILTINS, libsh::{error::{ShErr, ShErrKind, ShResult}, utils::CharDequeUtils}, prelude::*}; pub const KEYWORDS: [&'static str;14] = [ "if", "then", "elif", "else", "fi", "while", "until", "select", "for", "in", "do", "done", "case", "esac", ]; pub const OPENERS: [&'static str;6] = [ "if", "while", "until", "for", "select", "case" ]; #[derive(Clone,PartialEq,Default,Debug)] pub struct Span { range: Range, source: Arc } impl Span { /// New `Span`. Wraps a range and a string slice that it refers to. pub fn new(range: Range, source: Arc) -> Self { Span { range, source, } } /// Slice the source string at the wrapped range pub fn as_str(&self) -> &str { &self.source[self.start..self.end] } pub fn get_source(&self) -> Arc { self.source.clone() } pub fn range(&self) -> Range { self.range.clone() } } /// Allows simple access to the underlying range wrapped by the span impl Deref for Span { type Target = Range; fn deref(&self) -> &Self::Target { &self.range } } #[derive(Clone,PartialEq,Debug)] pub enum TkRule { Null, SOI, // Start-of-Input Str, Pipe, ErrPipe, And, Or, Bg, Sep, Redir, CasePattern, BraceGrpStart, BraceGrpEnd, Expanded { exp: Vec }, Comment, EOI, // End-of-Input } impl Default for TkRule { fn default() -> Self { TkRule::Null } } #[derive(Clone,Debug,PartialEq,Default)] pub struct Tk { pub class: TkRule, pub span: Span, pub flags: TkFlags } // There's one impl here and then another in expand.rs which has the expansion logic impl Tk { pub fn new(class: TkRule, span: Span) -> Self { Self { class, span, flags: TkFlags::empty() } } pub fn to_string(&self) -> String { match &self.class { TkRule::Expanded { exp } => exp.join(" "), _ => self.span.as_str().to_string() } } pub fn source(&self) -> Arc { self.span.source.clone() } /// Used to see if a separator is ';;' for case statements pub fn has_double_semi(&self) -> bool { let TkRule::Sep = self.class else { return false; }; self.span.as_str().trim() == ";;" } } impl Display for Tk { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match &self.class { TkRule::Expanded { exp } => write!(f,"{}",exp.join(" ")), _ => write!(f,"{}",self.span.as_str()) } } } bitflags! { #[derive(Debug,Clone,Copy,PartialEq,Default)] pub struct TkFlags: u32 { const KEYWORD = 0b0000000000000001; /// This is a keyword that opens a new block statement, like 'if' and 'while' const OPENER = 0b0000000000000010; const IS_CMD = 0b0000000000000100; const IS_SUBSH = 0b0000000000001000; const IS_OP = 0b0000000000010000; const ASSIGN = 0b0000000000100000; const BUILTIN = 0b0000000001000000; } } pub struct LexStream { source: Arc, pub cursor: usize, in_quote: bool, flags: LexFlags, } bitflags! { #[derive(Debug)] pub struct LexFlags: u32 { /// Return comment tokens const LEX_COMMENTS = 0b00000001; /// Allow unfinished input const LEX_UNFINISHED = 0b00000010; /// The next string-type token is a command name const NEXT_IS_CMD = 0b00000100; /// We are in a quotation, so quoting rules apply const IN_QUOTE = 0b00001000; /// Only lex strings; used in expansions const RAW = 0b00010000; /// The lexer has not produced any tokens yet const FRESH = 0b00010000; /// The lexer has no more tokens to produce const STALE = 0b00100000; /// The lexer's cursor is in a brace group const IN_BRC_GRP = 0b01000000; const EXPECTING_IN = 0b10000000; } } impl LexStream { pub fn new(source: Arc, flags: LexFlags) -> Self { flog!(TRACE, "new lex stream"); let flags = flags | LexFlags::FRESH | LexFlags::NEXT_IS_CMD; Self { source, cursor: 0, in_quote: false, flags } } /// Returns a slice of the source input using the given range /// Returns None if the range is out of the bounds of the string slice /// /// Works with any kind of range /// examples: /// `LexStream.slice(1..10)` /// `LexStream.slice(1..=10)` /// `LexStream.slice(..10)` /// `LexStream.slice(1..)` /// pub fn slice>(&self, range: R) -> Option<&str> { // Sketchy downcast let start = match range.start_bound() { Bound::Included(&start) => start, Bound::Excluded(&start) => start + 1, Bound::Unbounded => 0 }; let end = match range.end_bound() { Bound::Included(&end) => end, Bound::Excluded(&end) => end + 1, Bound::Unbounded => self.source.len() }; self.source.get(start..end) } pub fn slice_from_cursor(&self) -> Option<&str> { self.slice(self.cursor..) } pub fn in_brc_grp(&self) -> bool { self.flags.contains(LexFlags::IN_BRC_GRP) } pub fn set_in_brc_grp(&mut self, is: bool) { if is { self.flags |= LexFlags::IN_BRC_GRP; } else { self.flags &= !LexFlags::IN_BRC_GRP; } } pub fn next_is_cmd(&self) -> bool { self.flags.contains(LexFlags::NEXT_IS_CMD) } /// Set whether the next string token is a command name pub fn set_next_is_cmd(&mut self, is: bool) { if is { self.flags |= LexFlags::NEXT_IS_CMD; } else { self.flags &= !LexFlags::NEXT_IS_CMD; } } pub fn read_redir(&mut self) -> Option> { assert!(self.cursor <= self.source.len()); let slice = self.slice(self.cursor..)?; let mut pos = self.cursor; let mut chars = slice.chars().peekable(); let mut tk = Tk::default(); while let Some(ch) = chars.next() { match ch { '>' => { pos += 1; if let Some('>') = chars.peek() { chars.next(); pos += 1; } if let Some('&') = chars.peek() { chars.next(); pos += 1; let mut found_fd = false; while chars.peek().is_some_and(|ch| ch.is_ascii_digit()) { chars.next(); found_fd = true; pos += 1; } if !found_fd { return Some(Err( ShErr::full( ShErrKind::ParseErr, "Invalid redirection", Span::new(self.cursor..pos, self.source.clone()).into() ) )); } else { tk = self.get_token(self.cursor..pos, TkRule::Redir); break } } else { tk = self.get_token(self.cursor..pos, TkRule::Redir); break } } '<' => { pos += 1; for _ in 0..2 { if let Some('<') = chars.peek() { chars.next(); pos += 1; } else { break } } tk = self.get_token(self.cursor..pos, TkRule::Redir); break } '0'..='9' => { pos += 1; while chars.peek().is_some_and(|ch| ch.is_ascii_digit()) { chars.next(); pos += 1; } } _ => { return None; } } } if tk == Tk::default() { return None } self.cursor = pos; Some(Ok(tk)) } pub fn read_string(&mut self) -> ShResult { assert!(self.cursor <= self.source.len()); let slice = self.slice_from_cursor().unwrap().to_string(); let mut pos = self.cursor; let mut chars = slice.chars().peekable(); if let Some(count) = case_pat_lookahead(chars.clone()) { pos += count; let casepat_tk = self.get_token(self.cursor..pos, TkRule::CasePattern); self.cursor = pos; self.set_next_is_cmd(true); return Ok(casepat_tk) } while let Some(ch) = chars.next() { match ch { _ if self.flags.contains(LexFlags::RAW) => { if ch.is_whitespace() { break; } else { pos += ch.len_utf8() } } '\\' => { pos += 1; if let Some(ch) = chars.next() { pos += ch.len_utf8(); } } '$' if chars.peek() == Some(&'(') => { pos += 2; chars.next(); let mut paren_stack = vec!['(']; let paren_pos = pos; while let Some(ch) = chars.next() { match ch { '\\' => { pos += 1; if let Some(next_ch) = chars.next() { pos += next_ch.len_utf8(); } } '(' => { pos += 1; paren_stack.push(ch); } ')' => { pos += 1; paren_stack.pop(); if paren_stack.is_empty() { break } } _ => pos += ch.len_utf8() } } if !paren_stack.is_empty() { return Err( ShErr::full( ShErrKind::ParseErr, "Unclosed subshell", Span::new(paren_pos..paren_pos + 1, self.source.clone()) ) ) } } '(' if self.next_is_cmd() => { let mut paren_stack = vec!['(']; let paren_pos = pos; while let Some(ch) = chars.next() { pos += ch.len_utf8(); match ch { '\\' => { if let Some(next_ch) = chars.next() { pos += next_ch.len_utf8(); } } '(' => { pos += 1; paren_stack.push(ch); } ')' => { pos += 1; paren_stack.pop(); if paren_stack.is_empty() { break } } _ => continue } } if !paren_stack.is_empty() { return Err( ShErr::full( ShErrKind::ParseErr, "Unclosed subshell", Span::new(paren_pos..paren_pos + 1, self.source.clone()) ) ) } let mut subsh_tk = self.get_token(self.cursor..pos, TkRule::Str); subsh_tk.flags |= TkFlags::IS_CMD; subsh_tk.flags |= TkFlags::IS_SUBSH; self.cursor = pos; self.set_next_is_cmd(true); return Ok(subsh_tk) } '{' if pos == self.cursor && self.next_is_cmd() => { pos += 1; let mut tk = self.get_token(self.cursor..pos, TkRule::BraceGrpStart); tk.flags |= TkFlags::IS_CMD; self.set_in_brc_grp(true); self.set_next_is_cmd(true); self.cursor = pos; return Ok(tk) } '}' if pos == self.cursor && self.in_brc_grp() => { pos += 1; let tk = self.get_token(self.cursor..pos, TkRule::BraceGrpEnd); self.set_in_brc_grp(false); self.set_next_is_cmd(true); self.cursor = pos; return Ok(tk) } '"' | '\'' => { self.in_quote = true; pos += 1; while let Some(q_ch) = chars.next() { match q_ch { '\\' => { pos += 1; if chars.next().is_some() { pos += 1; } } _ if q_ch == ch => { pos += 1; self.in_quote = false; break } // Any time an ambiguous character is found // we must push the cursor by the length of the character // instead of just assuming a length of 1. // Allows spans to work for wide characters _ => pos += q_ch.len_utf8() } } } _ if !self.in_quote && is_op(ch) => break, _ if is_hard_sep(ch) => break, _ => pos += ch.len_utf8() } } let mut new_tk = self.get_token(self.cursor..pos, TkRule::Str); if self.in_quote && !self.flags.contains(LexFlags::LEX_UNFINISHED) { return Err( ShErr::full( ShErrKind::ParseErr, "Unterminated quote", new_tk.span.into(), ) ); } // TODO: clean up this mess if self.flags.contains(LexFlags::NEXT_IS_CMD) { if is_keyword(&new_tk.span.as_str()) { if matches!(new_tk.span.as_str(), "case" | "select" | "for") { self.flags |= LexFlags::EXPECTING_IN; new_tk.flags |= TkFlags::KEYWORD; self.set_next_is_cmd(false); } else { new_tk.flags |= TkFlags::KEYWORD; } } else if is_assignment(&new_tk.span.as_str()) { new_tk.flags |= TkFlags::ASSIGN; } else { if self.flags.contains(LexFlags::EXPECTING_IN) { if new_tk.span.as_str() != "in" { new_tk.flags |= TkFlags::IS_CMD; } else { new_tk.flags |= TkFlags::KEYWORD; self.flags &= !LexFlags::EXPECTING_IN; } } else { new_tk.flags |= TkFlags::IS_CMD; } if BUILTINS.contains(&new_tk.span.as_str()) { new_tk.flags |= TkFlags::BUILTIN; } self.set_next_is_cmd(false); } } else if self.flags.contains(LexFlags::EXPECTING_IN) { if new_tk.span.as_str() == "in" { new_tk.flags |= TkFlags::KEYWORD; self.flags &= !LexFlags::EXPECTING_IN; } } self.cursor = pos; Ok(new_tk) } pub fn get_token(&self, range: Range, class: TkRule) -> Tk { let span = Span::new(range, self.source.clone()); Tk::new(class, span) } } impl Iterator for LexStream { type Item = ShResult; fn next(&mut self) -> Option { assert!(self.cursor <= self.source.len()); // We are at the end of the input if self.cursor == self.source.len() { if self.flags.contains(LexFlags::STALE) { // We've already returned an EOI token, nothing left to do return None } else { // Return the EOI token let token = self.get_token(self.cursor..self.cursor, TkRule::EOI); self.flags |= LexFlags::STALE; return Some(Ok(token)) } } // Return the SOI token if self.flags.contains(LexFlags::FRESH) { self.flags &= !LexFlags::FRESH; let token = self.get_token(self.cursor..self.cursor, TkRule::SOI); return Some(Ok(token)) } // If we are just reading raw words, short circuit here // Used for word splitting variable values if self.flags.contains(LexFlags::RAW) { return Some(self.read_string()) } loop { let pos = self.cursor; if self.slice(pos..pos+2) == Some("\\\n") { self.cursor += 2; } else if pos < self.source.len() && is_field_sep(get_char(&self.source, pos).unwrap()) { self.cursor += 1; } else { break } } if self.cursor == self.source.len() { return None } let token = match get_char(&self.source, self.cursor).unwrap() { '\r' | '\n' | ';' => { let ch_idx = self.cursor; self.cursor += 1; self.set_next_is_cmd(true); while let Some(ch) = get_char(&self.source, self.cursor) { if is_hard_sep(ch) { // Combine consecutive separators into one, including whitespace self.cursor += 1; } else { break } } self.get_token(ch_idx..self.cursor, TkRule::Sep) } '#' => { let ch_idx = self.cursor; self.cursor += 1; while let Some(ch) = get_char(&self.source, self.cursor) { self.cursor += 1; if ch == '\n' { break } } self.get_token(ch_idx..self.cursor, TkRule::Comment) } '|' => { let ch_idx = self.cursor; self.cursor += 1; self.set_next_is_cmd(true); let tk_type = if let Some('|') = get_char(&self.source, self.cursor) { self.cursor += 1; TkRule::Or } else if let Some('&') = get_char(&self.source, self.cursor) { self.cursor += 1; TkRule::ErrPipe } else { TkRule::Pipe }; self.get_token(ch_idx..self.cursor, tk_type) } '&' => { let ch_idx = self.cursor; self.cursor += 1; self.set_next_is_cmd(true); let tk_type = if let Some('&') = get_char(&self.source, self.cursor) { self.cursor += 1; TkRule::And } else { TkRule::Bg }; self.get_token(ch_idx..self.cursor, tk_type) } _ => { if let Some(tk) = self.read_redir() { self.set_next_is_cmd(false); match tk { Ok(tk) => tk, Err(e) => return Some(Err(e)) } } else { match self.read_string() { Ok(tk) => tk, Err(e) => return Some(Err(e)) } } } }; Some(Ok(token)) } } pub fn get_char(src: &str, idx: usize) -> Option { src.get(idx..)?.chars().next() } pub fn is_assignment(text: &str) -> bool { let mut chars = text.chars(); while let Some(ch) = chars.next() { match ch { '\\' => { chars.next(); } '=' => return true, _ => continue } } false } /// Is '|', '&', '>', or '<' pub fn is_op(ch: char) -> bool { matches!(ch, '|' | '&' | '>' | '<') } /// Is whitespace or a semicolon pub fn is_hard_sep(ch: char) -> bool { matches!(ch, ' ' | '\t' | '\n' | ';') } /// Is whitespace, but not a newline pub fn is_field_sep(ch: char) -> bool { matches!(ch, ' ' | '\t') } pub fn is_keyword(slice: &str) -> bool { KEYWORDS.contains(&slice) || (slice.ends_with("()") && !slice.ends_with("\\()")) } pub fn lookahead(pat: &str, mut chars: Chars) -> Option { let mut pos = 0; let mut char_deque = VecDeque::new(); while let Some(ch) = chars.next() { char_deque.push_back(ch); if char_deque.len() > pat.len() { char_deque.pop_front(); } if char_deque.starts_with(pat) { return Some(pos) } pos += 1; } None } pub fn case_pat_lookahead(mut chars: Peekable) -> Option { let mut pos = 0; while let Some(ch) = chars.next() { pos += 1; match ch { _ if is_hard_sep(ch) => return None, '\\' => { chars.next(); } ')' => return Some(pos), '(' => return None, _ => { /* continue */ } } } None }