use std::{fmt::Display, ops::{Bound, Deref, Range, RangeBounds}}; use bitflags::bitflags; use crate::{builtin::BUILTINS, prelude::*}; pub const KEYWORDS: [&'static str;14] = [ "if", "then", "elif", "else", "fi", "while", "until", "select", "for", "in", "do", "done", "case", "esac", ]; pub const OPENERS: [&'static str;6] = [ "if", "while", "until", "for", "select", "case" ]; #[derive(Clone,PartialEq,Default,Debug)] pub struct Span<'s> { range: Range, source: &'s str } impl<'s> Span<'s> { /// New `Span`. Wraps a range and a string slice that it refers to. pub fn new(range: Range, source: &'s str) -> Self { Span { range, source, } } /// Slice the source string at the wrapped range pub fn as_str(&self) -> &str { &self.source[self.start..self.end] } pub fn get_source(&'s self) -> &'s str { self.source } pub fn range(&self) -> Range { self.range.clone() } } /// Allows simple access to the underlying range wrapped by the span impl<'s> Deref for Span<'s> { type Target = Range; fn deref(&self) -> &Self::Target { &self.range } } #[derive(Clone,PartialEq,Debug)] pub enum TkRule { Null, SOI, // Start-of-Input Str, Pipe, ErrPipe, And, Or, Bg, Sep, Redir, Expanded { exp: Vec }, Comment, EOI, // End-of-Input } impl Default for TkRule { fn default() -> Self { TkRule::Null } } #[derive(Clone,Copy,PartialEq,Debug)] pub enum TkErr { Null, UntermQuote, UntermSubsh, UntermEscape, UntermBrace, BadRedir, BadPipe, HangingDelim, } impl Default for TkErr { fn default() -> Self { TkErr::Null } } pub enum TkState { Raw, } #[derive(Clone,Debug,PartialEq,Default)] pub struct Tk<'s> { pub class: TkRule, pub err_span: Option>, pub err: TkErr, pub span: Span<'s>, pub flags: TkFlags } // There's one impl here and then another in expand.rs which has the expansion logic impl<'s> Tk<'s> { pub fn new(class: TkRule, span: Span<'s>) -> Self { Self { class, err_span: None, err: TkErr::Null, span, flags: TkFlags::empty() } } pub fn to_string(&self) -> String { match &self.class { TkRule::Expanded { exp } => exp.join(" "), _ => self.span.as_str().to_string() } } pub fn set_err(&mut self, range: Range, slice: &'s str, err: TkErr) { self.err_span = Some(Span::new(range, slice)); self.err = err } pub fn is_err(&self) -> bool { self.err_span.is_some() } pub fn source(&self) -> &'s str { self.span.source } } impl<'s> Display for Tk<'s> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match &self.class { TkRule::Expanded { exp } => write!(f,"{}",exp.join(" ")), _ => write!(f,"{}",self.span.as_str()) } } } bitflags! { #[derive(Debug,Clone,Copy,PartialEq,Default)] pub struct TkFlags: u32 { const KEYWORD = 0b0000000000000001; /// This is a keyword that opens a new block statement, like 'if' and 'while' const OPENER = 0b0000000000000010; const IS_CMD = 0b0000000000000100; const IS_OP = 0b0000000000001000; const ASSIGN = 0b0000000000010000; const BUILTIN = 0b0000000000100000; } } pub struct LexStream<'t> { source: &'t str, pub cursor: usize, in_quote: bool, flags: LexFlags, } bitflags! { pub struct LexFlags: u32 { /// Return comment tokens const LEX_COMMENTS = 0b00000001; /// Allow unfinished input const LEX_UNFINISHED = 0b00000010; /// The next string-type token is a command name const NEXT_IS_CMD = 0b00000100; /// We are in a quotation, so quoting rules apply const IN_QUOTE = 0b00001000; /// Only lex strings; used in expansions const RAW = 0b00010000; /// The lexer has not produced any tokens yet const FRESH = 0b00010000; /// The lexer has no more tokens to produce const STALE = 0b00100000; } } impl<'t> LexStream<'t> { pub fn new(source: &'t str, flags: LexFlags) -> Self { flog!(TRACE, "new lex stream"); let flags = flags | LexFlags::FRESH | LexFlags::NEXT_IS_CMD; Self { source, cursor: 0, in_quote: false, flags } } /// Returns a slice of the source input using the given range /// Returns None if the range is out of the bounds of the string slice /// /// Works with any kind of range /// examples: /// `LexStream.slice(1..10)` /// `LexStream.slice(1..=10)` /// `LexStream.slice(..10)` /// `LexStream.slice(1..)` /// pub fn slice>(&self, range: R) -> Option<&'t str> { // Sketchy downcast let start = match range.start_bound() { Bound::Included(&start) => start, Bound::Excluded(&start) => start + 1, Bound::Unbounded => 0 }; let end = match range.end_bound() { Bound::Included(&end) => end, Bound::Excluded(&end) => end + 1, Bound::Unbounded => self.source.len() }; self.source.get(start..end) } pub fn slice_from_cursor(&self) -> Option<&'t str> { self.slice(self.cursor..) } /// The next string token is a command name pub fn next_is_cmd(&mut self) { self.flags |= LexFlags::NEXT_IS_CMD; } /// The next string token is not a command name pub fn next_is_not_cmd(&mut self) { self.flags &= !LexFlags::NEXT_IS_CMD; } pub fn read_redir(&mut self) -> Option> { assert!(self.cursor <= self.source.len()); let slice = self.slice(self.cursor..)?; let mut pos = self.cursor; let mut chars = slice.chars().peekable(); let mut tk = Tk::default(); while let Some(ch) = chars.next() { match ch { '>' => { pos += 1; if let Some('>') = chars.peek() { chars.next(); pos += 1; } if let Some('&') = chars.peek() { chars.next(); pos += 1; let mut found_fd = false; while chars.peek().is_some_and(|ch| ch.is_ascii_digit()) { chars.next(); found_fd = true; pos += 1; } if !found_fd { let err = TkErr::BadRedir; tk = self.get_token(self.cursor..pos, TkRule::Redir); tk.set_err(self.cursor..pos, self.source, err); break } else { tk = self.get_token(self.cursor..pos, TkRule::Redir); break } } else { tk = self.get_token(self.cursor..pos, TkRule::Redir); break } } '<' => { pos += 1; for _ in 0..2 { if let Some('<') = chars.peek() { chars.next(); pos += 1; } else { break } } tk = self.get_token(self.cursor..pos, TkRule::Redir); break } '0'..='9' => { pos += 1; while chars.peek().is_some_and(|ch| ch.is_ascii_digit()) { chars.next(); pos += 1; } } _ => { return None; } } } assert!(tk != Tk::default()); self.cursor = pos; Some(tk) } pub fn read_string(&mut self) -> Tk<'t> { assert!(self.cursor <= self.source.len()); let slice = self.slice_from_cursor().unwrap(); let mut pos = self.cursor; let mut chars = slice.chars(); let mut quote_pos = None; while let Some(ch) = chars.next() { match ch { '"' | '\'' => { self.in_quote = true; quote_pos = Some(pos); pos += 1; while let Some(q_ch) = chars.next() { match q_ch { '\\' => { pos += 2; chars.next(); } _ if q_ch == ch => { pos += 1; self.in_quote = false; break } // Any time an ambiguous character is found // we must push the cursor by the length of the character // instead of just assuming a length of 1. // Allows spans to work for wide characters _ => pos += q_ch.len_utf8() } } } _ if self.flags.contains(LexFlags::RAW) => { if ch.is_whitespace() { break; } else { pos += ch.len_utf8() } } _ if !self.in_quote && is_op(ch) => break, _ if is_hard_sep(ch) => break, _ => pos += ch.len_utf8() } } let mut new_tk = self.get_token(self.cursor..pos, TkRule::Str); if self.in_quote && !self.flags.contains(LexFlags::LEX_UNFINISHED) { new_tk.set_err( quote_pos.unwrap()..pos, self.source, TkErr::UntermQuote ); } if self.flags.contains(LexFlags::NEXT_IS_CMD) { if KEYWORDS.contains(&new_tk.span.as_str()) { new_tk.flags |= TkFlags::KEYWORD; } else if is_assignment(&new_tk.span.as_str()) { new_tk.flags |= TkFlags::ASSIGN; } else { new_tk.flags |= TkFlags::IS_CMD; flog!(TRACE, new_tk.span.as_str()); if BUILTINS.contains(&new_tk.span.as_str()) { new_tk.flags |= TkFlags::BUILTIN; } flog!(TRACE, new_tk.flags); self.next_is_not_cmd(); } } self.cursor = pos; new_tk } pub fn get_token(&self, range: Range, class: TkRule) -> Tk<'t> { let span = Span::new(range, self.source); Tk::new(class, span) } } impl<'t> Iterator for LexStream<'t> { type Item = Tk<'t>; fn next(&mut self) -> Option { assert!(self.cursor <= self.source.len()); // We are at the end of the input if self.cursor == self.source.len() { if self.flags.contains(LexFlags::STALE) { // We've already returned an EOI token, nothing left to do return None } else { // Return the EOI token let token = self.get_token(self.cursor..self.cursor, TkRule::EOI); self.flags |= LexFlags::STALE; return Some(token) } } // Return the SOI token if self.flags.contains(LexFlags::FRESH) { self.flags &= !LexFlags::FRESH; let token = self.get_token(self.cursor..self.cursor, TkRule::SOI); return Some(token) } // If we are just reading raw words, short circuit here // Used for word splitting variable values if self.flags.contains(LexFlags::RAW) { return Some(self.read_string()) } loop { let pos = self.cursor; if self.slice(pos..pos+2) == Some("\\\n") { self.cursor += 2; } else if pos < self.source.len() && is_field_sep(get_char(self.source, pos).unwrap()) { self.cursor += 1; } else { break } } if self.cursor == self.source.len() { return None } let token = match get_char(self.source, self.cursor).unwrap() { '\r' | '\n' | ';' => { let ch_idx = self.cursor; self.cursor += 1; self.next_is_cmd(); while let Some(ch) = get_char(self.source, self.cursor) { if is_hard_sep(ch) { // Combine consecutive separators into one, including whitespace self.cursor += 1; } else { break } } self.get_token(ch_idx..self.cursor, TkRule::Sep) } '#' => { let ch_idx = self.cursor; self.cursor += 1; while let Some(ch) = get_char(self.source, self.cursor) { self.cursor += 1; if ch == '\n' { break } } self.get_token(ch_idx..self.cursor, TkRule::Comment) } '|' => { let ch_idx = self.cursor; self.cursor += 1; self.next_is_cmd(); let tk_type = if let Some('|') = get_char(self.source, self.cursor) { self.cursor += 1; TkRule::Or } else if let Some('&') = get_char(self.source, self.cursor) { self.cursor += 1; TkRule::ErrPipe } else { TkRule::Pipe }; self.get_token(ch_idx..self.cursor, tk_type) } '&' => { let ch_idx = self.cursor; self.cursor += 1; self.next_is_cmd(); let tk_type = if let Some('&') = get_char(self.source, self.cursor) { self.cursor += 1; TkRule::And } else { TkRule::Bg }; self.get_token(ch_idx..self.cursor, tk_type) } _ => { if let Some(tk) = self.read_redir() { self.next_is_not_cmd(); tk } else { self.read_string() } } }; Some(token) } } pub fn get_char(src: &str, idx: usize) -> Option { src.get(idx..)?.chars().next() } pub fn is_assignment(text: &str) -> bool { let mut chars = text.chars(); while let Some(ch) = chars.next() { match ch { '\\' => { chars.next(); } '=' => return true, _ => continue } } false } /// Is '|', '&', '>', or '<' pub fn is_op(ch: char) -> bool { matches!(ch, '|' | '&' | '>' | '<') } /// Is whitespace or a semicolon pub fn is_hard_sep(ch: char) -> bool { matches!(ch, ' ' | '\t' | '\n' | ';') } /// Is whitespace, but not a newline pub fn is_field_sep(ch: char) -> bool { matches!(ch, ' ' | '\t') }