shed/src/parse/lex.rs

use std::{
  collections::VecDeque,
  fmt::Display,
  iter::Peekable,
  ops::{Bound, Range, RangeBounds},
  str::Chars,
  sync::Arc,
};

use bitflags::bitflags;

use crate::{
  builtin::BUILTINS,
  libsh::{
    error::{ShErr, ShErrKind, ShResult},
    utils::CharDequeUtils,
  },
};

/// Words that `is_keyword` accepts as shell keywords, including the
/// `[[` / `]]` test brackets and the `!` negation.
pub const KEYWORDS: [&str; 17] = [
  "if", "then", "elif", "else", "fi", "while", "until", "select", "for", "in", "do", "done",
  "case", "esac", "[[", "]]", "!",
];

/// The subset of keywords that `Tk::is_opener` treats as opening a block.
pub const OPENERS: [&str; 6] = ["if", "while", "until", "for", "select", "case"];

/// Quoting context of the lexer: either outside of any quote, or inside a
/// single- or double-quoted region.
#[derive(Default, Debug)]
pub enum QuoteState {
  /// Not inside any quote (the initial state)
  #[default]
  Outside,
  /// Inside `'...'`
  Single,
  /// Inside `"..."`
  Double,
}

impl QuoteState {
  /// True when no quote is open.
  pub fn outside(&self) -> bool {
    !self.in_single() && !self.in_double()
  }
  /// True when a single quote is open.
  pub fn in_single(&self) -> bool {
    matches!(*self, Self::Single)
  }
  /// True when a double quote is open.
  pub fn in_double(&self) -> bool {
    matches!(*self, Self::Double)
  }
  /// True when any quote (single or double) is open.
  pub fn in_quote(&self) -> bool {
    self.in_single() || self.in_double()
  }
  /// Opens or closes a double quote. Inside a single quote this is a no-op,
  /// because a `"` there is just a literal character.
  pub fn toggle_double(&mut self) {
    *self = match *self {
      Self::Outside => Self::Double,
      Self::Double => Self::Outside,
      Self::Single => Self::Single,
    };
  }
  /// Opens or closes a single quote. Inside a double quote this is a no-op,
  /// because a `'` there is not interpreted.
  pub fn toggle_single(&mut self) {
    *self = match *self {
      Self::Outside => Self::Single,
      Self::Single => Self::Outside,
      Self::Double => Self::Double,
    };
  }
}

/// A named piece of source text. The text itself is shared behind an `Arc`,
/// so cloning a `SpanSource` copies the name but not the content.
#[derive(Clone, PartialEq, Default, Debug, Eq, Hash)]
pub struct SpanSource {
  name: String,
  content: Arc<String>,
}

impl SpanSource {
  /// The display name of this source.
  pub fn name(&self) -> &str {
    self.name.as_str()
  }
  /// A new handle to the shared source text.
  pub fn content(&self) -> Arc<String> {
    Arc::clone(&self.content)
  }
  /// Replaces the display name.
  pub fn rename(&mut self, name: String) {
    self.name = name;
  }
}

/// Displays as the source's name only; the content is not printed.
impl Display for SpanSource {
  fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    f.write_str(self.name())
  }
}

/// A byte range paired with the source it points into.
///
/// Built with `Span::new(10..20, source)` or [`Span::from_span_source`].
#[derive(Clone, PartialEq, Default, Debug)]
pub struct Span {
  range: Range<usize>,
  source: SpanSource,
}

impl Span {
  /// Creates a span over `range` of `source`. The source is named `<stdin>`
  /// until renamed.
  pub fn new(range: Range<usize>, source: Arc<String>) -> Self {
    Self::from_span_source(
      range,
      SpanSource {
        name: String::from("<stdin>"),
        content: source,
      },
    )
  }
  /// Creates a span over `range` of an already-named source.
  pub fn from_span_source(range: Range<usize>, source: SpanSource) -> Self {
    Self { range, source }
  }
  /// Renames the underlying source in place.
  pub fn rename(&mut self, name: String) {
    self.source.name = name;
  }
  /// Builder-style variant of [`Span::rename`].
  pub fn with_name(mut self, name: String) -> Self {
    self.rename(name);
    self
  }
  /// Looks up the start of the range with ariadne's `get_byte_line` and
  /// returns the second and third components of its result.
  ///
  /// NOTE(review): presumably a (line index, column) pair; confirm the exact
  /// meaning and base against the ariadne version in use.
  ///
  /// # Panics
  /// Panics if ariadne cannot map the start offset to a line.
  pub fn line_and_col(&self) -> (usize, usize) {
    let text = self.get_source();
    let lookup = ariadne::Source::from(text.as_str());
    let (_, line, col) = lookup.get_byte_line(self.range.start).unwrap();
    (line, col)
  }
  /// The text covered by this span.
  ///
  /// # Panics
  /// Panics if the range is out of bounds of the source or does not fall on
  /// character boundaries.
  pub fn as_str(&self) -> &str {
    let Range { start, end } = self.range;
    &self.source.content[start..end]
  }
  /// A new handle to the whole shared source text.
  pub fn get_source(&self) -> Arc<String> {
    Arc::clone(&self.source.content)
  }
  /// The named source this span points into.
  pub fn span_source(&self) -> &SpanSource {
    &self.source
  }
  /// A copy of the wrapped byte range.
  pub fn range(&self) -> Range<usize> {
    self.range.start..self.range.end
  }
  /// With great power comes great responsibility
  /// Only use this in the most dire of circumstances
  pub fn set_range(&mut self, range: Range<usize>) {
    self.range = range;
  }
}

impl ariadne::Span for Span {
  type SourceId = SpanSource;

  fn source(&self) -> &Self::SourceId {
    &self.source
  }

  fn start(&self) -> usize {
    self.range.start
  }

  fn end(&self) -> usize {
    self.range.end
  }
}

/// The kind of a lexed token (stored in `Tk::class`).
///
/// The default is [`TkRule::Null`].
#[derive(Clone, PartialEq, Debug, Default)]
pub enum TkRule {
  /// Placeholder kind of a default-constructed token
  #[default]
  Null,
  SOI, // Start-of-Input
  /// A word: command name, argument, keyword, assignment, subshell, ...
  Str,
  /// `|`
  Pipe,
  /// `|&`
  ErrPipe,
  /// `&&`
  And,
  /// `||`
  Or,
  /// `!` in command position
  Bang,
  /// `&`
  Bg,
  /// A run of command separators (`;`, newlines and surrounding blanks)
  Sep,
  /// A redirection operator, or the body of a heredoc
  Redir,
  /// A pattern inside a `case` statement
  CasePattern,
  /// `{` opening a brace group
  BraceGrpStart,
  /// `}` closing a brace group
  BraceGrpEnd,
  /// A token that has been replaced by the words in `exp`
  Expanded { exp: Vec<String> },
  /// A `#` comment
  Comment,
  EOI, // End-of-Input
}

/// A single lexed token: its kind, where it sits in the source, and any
/// extra classification flags.
#[derive(Clone, Debug, PartialEq, Default)]
pub struct Tk {
  /// The kind of token
  pub class: TkRule,
  /// The region of source text this token covers
  pub span: Span,
  /// Additional markers (keyword, command name, heredoc, ...)
  pub flags: TkFlags,
}

// There's one impl here and then another in expand.rs which has the expansion
// logic
impl Tk {
  /// Creates a token of the given kind over `span`, with no flags set.
  pub fn new(class: TkRule, span: Span) -> Self {
    Self {
      class,
      span,
      flags: TkFlags::empty(),
    }
  }
  /// The source text covered by this token.
  pub fn as_str(&self) -> &str {
    self.span.as_str()
  }
  /// A new handle to the whole source string this token was lexed from.
  pub fn source(&self) -> Arc<String> {
    self.span.source.content.clone()
  }
  /// Sets `flag` on this token (existing flags are kept).
  pub fn mark(&mut self, flag: TkFlags) {
    self.flags |= flag;
  }
  /// Used to see if a separator is ';;' for case statements
  pub fn has_double_semi(&self) -> bool {
    matches!(self.class, TkRule::Sep) && self.span.as_str().trim() == ";;"
  }

  /// Whether this token opens a block: one of [`OPENERS`], a `{`, or a case
  /// pattern.
  pub fn is_opener(&self) -> bool {
    OPENERS.contains(&self.as_str())
      || matches!(self.class, TkRule::BraceGrpStart | TkRule::CasePattern)
  }

  /// Whether this token closes a block: `fi`, `done`, `esac`, `;;` or `}`.
  pub fn is_closer(&self) -> bool {
    matches!(self.as_str(), "fi" | "done" | "esac")
      || self.has_double_semi()
      || matches!(self.class, TkRule::BraceGrpEnd)
  }

  /// Whether this token is the closer that pairs with the opener `other`.
  ///
  /// Pairs: `{` / `}`, case pattern / `;;`, `for`, `while`, `until`,
  /// `select` / `done`, `if` / `fi`, `case` / `esac`.
  pub fn is_closer_for(&self, other: &Tk) -> bool {
    match other.class {
      TkRule::BraceGrpStart if matches!(self.class, TkRule::BraceGrpEnd) => return true,
      TkRule::CasePattern if self.has_double_semi() => return true,
      _ => {}
    }
    match other.as_str() {
      // `select` is listed in OPENERS and, like the loops, is closed by `done`
      "for" | "while" | "until" | "select" => self.as_str() == "done",
      "if" => self.as_str() == "fi",
      "case" => self.as_str() == "esac",
      _ => false,
    }
  }
}

/// Prints an expanded token as its words joined by single spaces, and any
/// other token as the source text it covers.
impl Display for Tk {
  fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    if let TkRule::Expanded { exp } = &self.class {
      let joined = exp.join(" ");
      f.write_str(&joined)
    } else {
      f.write_str(self.span.as_str())
    }
  }
}

bitflags! {
  /// Extra classification attached to a token in addition to its `TkRule`.
  #[derive(Debug,Clone,Copy,PartialEq,Default)]
  pub struct TkFlags: u32 {
    /// The token is a shell keyword
    const KEYWORD      = 0b0000000000000001;
    /// This is a keyword that opens a new block statement, like 'if' and 'while'
    const OPENER       = 0b0000000000000010;
    /// The token is in command position (command name, subshell, or `{`)
    const IS_CMD       = 0b0000000000000100;
    /// The token is a `( ... )` subshell
    const IS_SUBSH     = 0b0000000000001000;
    /// The token is a `$( ... )` command substitution
    const IS_CMDSUB    = 0b0000000000010000;
    const IS_OP        = 0b0000000000100000;
    /// The token contains an unescaped `=` while in command position
    const ASSIGN       = 0b0000000001000000;
    /// The command name is listed in `BUILTINS`
    const BUILTIN      = 0b0000000010000000;
    const IS_PROCSUB   = 0b0000000100000000;
		/// The token's span is the body of a heredoc
		const IS_HEREDOC   = 0b0000001000000000;
		/// The heredoc delimiter contained a quote character
		const LIT_HEREDOC  = 0b0000010000000000;
		/// The heredoc was introduced with `<<-`
		const TAB_HEREDOC  = 0b0000100000000000;
  }
}

bitflags! {
  /// Configuration and running state of a `LexStream`.
  #[derive(Debug, Clone, Copy)]
  pub struct LexFlags: u32 {
    /// The lexer is operating in interactive mode
    const INTERACTIVE    = 0b0000000001;
    /// Allow unfinished input
    const LEX_UNFINISHED = 0b0000000010;
    /// The next string-type token is a command name
    const NEXT_IS_CMD    = 0b0000000100;
    /// We are in a quotation, so quoting rules apply
    const IN_QUOTE       = 0b0000001000;
    /// Only lex strings; used in expansions
    const RAW            = 0b0000010000;
    /// The lexer has not produced any tokens yet
    const FRESH          = 0b0000100000;
    /// The lexer has no more tokens to produce
    const STALE          = 0b0001000000;
    /// A `case`, `select` or `for` keyword was just lexed, so an upcoming
    /// `in` is to be marked as a keyword
    const EXPECTING_IN   = 0b0010000000;
  }
}

/// Normalizes raw input before lexing.
///
/// * Every line ending (`\r\n` or a lone `\r`) becomes `\n`.
/// * A backslash immediately followed by a line ending is a line
///   continuation: both the backslash and the line ending are removed. This
///   holds for any of the line endings above, so CRLF input behaves the same
///   as LF input.
///
/// All other characters are copied through unchanged.
pub fn clean_input(input: &str) -> String {
  let mut chars = input.chars().peekable();
  let mut output = String::with_capacity(input.len());
  while let Some(ch) = chars.next() {
    match ch {
      '\\' if matches!(chars.peek(), Some('\n' | '\r')) => {
        // Drop the backslash together with the whole line ending
        if chars.next() == Some('\r') && chars.peek() == Some(&'\n') {
          chars.next();
        }
      }
      '\r' => {
        if chars.peek() == Some(&'\n') {
          chars.next();
        }
        output.push('\n');
      }
      _ => output.push(ch),
    }
  }
  output
}

/// A lexer over a shared source string; iterate it to obtain tokens.
pub struct LexStream {
  /// The full input being lexed
  source: Arc<String>,
  /// Byte offset of the next unread character in `source`
  pub cursor: usize,
  /// Name given to the source of every token this lexer produces
  pub name: String,
  /// Quote context carried while reading words and heredoc delimiters
  quote_state: QuoteState,
  /// Number of currently open `{` brace groups
  brc_grp_depth: usize,
  /// Cursor position recorded when the outermost brace group was entered
  brc_grp_start: Option<usize>,
  /// Incremented on `case`/`select`/`for`, decremented on `esac`; while
  /// non-zero, words are first tried as case patterns
  case_depth: usize,
	/// Byte offset just past the last heredoc body read on the current line;
	/// consumed when the line's newline is lexed
	heredoc_skip: Option<usize>,
  /// Lexer configuration and running state
  flags: LexFlags,
}


impl LexStream {
  /// Creates a lexer over `source`, positioned at offset 0 and named
  /// `<stdin>`. `FRESH` and `NEXT_IS_CMD` are always added to `flags`.
  pub fn new(source: Arc<String>, flags: LexFlags) -> Self {
    let initial_state = LexFlags::FRESH | LexFlags::NEXT_IS_CMD;
    Self {
      source,
      cursor: 0,
      name: String::from("<stdin>"),
      quote_state: QuoteState::default(),
      brc_grp_depth: 0,
      brc_grp_start: None,
      case_depth: 0,
      heredoc_skip: None,
      flags: flags | initial_state,
    }
  }
  /// Returns a slice of the source input using the given byte range.
  /// Returns None if the range is out of the bounds of the source string or
  /// does not fall on character boundaries.
  ///
  /// Works with any kind of range, with the usual meaning of each bound:
  /// `1..10` covers bytes 1 through 9, `1..=10` covers bytes 1 through 10.
  /// examples:
  /// `LexStream.slice(1..10)`
  /// `LexStream.slice(1..=10)`
  /// `LexStream.slice(..10)`
  /// `LexStream.slice(1..)`
  pub fn slice<R: RangeBounds<usize>>(&self, range: R) -> Option<&str> {
    let start = match range.start_bound() {
      Bound::Included(&start) => start,
      Bound::Excluded(&start) => start.checked_add(1)?,
      Bound::Unbounded => 0,
    };
    // `str::get` takes a half-open range, so only an inclusive end needs
    // to be moved one past
    let end = match range.end_bound() {
      Bound::Included(&end) => end.checked_add(1)?,
      Bound::Excluded(&end) => end,
      Bound::Unbounded => self.source.len(),
    };
    self.source.get(start..end)
  }
  pub fn with_name(mut self, name: String) -> Self {
    self.name = name;
    self
  }
  /// The unread remainder of the input, from the cursor to the end.
  /// `None` if the cursor is not a valid slicing position.
  pub fn slice_from_cursor(&self) -> Option<&str> {
    let from = self.cursor;
    self.slice(from..)
  }
  /// Whether at least one brace group is currently open.
  pub fn in_brc_grp(&self) -> bool {
    self.brc_grp_depth != 0
  }
  /// Records entry into a brace group. For the outermost group the current
  /// cursor is remembered as the group's start.
  pub fn enter_brc_grp(&mut self) {
    if !self.in_brc_grp() {
      self.brc_grp_start = Some(self.cursor);
    }
    self.brc_grp_depth += 1;
  }
  /// Records leaving a brace group. Once the outermost group is closed the
  /// remembered start position is cleared.
  ///
  /// Calling this with no group open is a no-op rather than an integer
  /// underflow (which would panic in debug builds and leave the lexer
  /// permanently "inside" a group in release builds).
  pub fn leave_brc_grp(&mut self) {
    self.brc_grp_depth = self.brc_grp_depth.saturating_sub(1);
    if self.brc_grp_depth == 0 {
      self.brc_grp_start = None;
    }
  }
  /// Whether the next string token will be treated as a command name.
  pub fn next_is_cmd(&self) -> bool {
    self.flags.intersects(LexFlags::NEXT_IS_CMD)
  }
  /// Sets or clears the marker saying that the next string token is a
  /// command name.
  pub fn set_next_is_cmd(&mut self, is: bool) {
    self.flags = match is {
      true => self.flags | LexFlags::NEXT_IS_CMD,
      false => self.flags & !LexFlags::NEXT_IS_CMD,
    };
  }
  pub fn read_redir(&mut self) -> Option<ShResult<Tk>> {
    assert!(self.cursor <= self.source.len());
    let slice = self.slice(self.cursor..)?.to_string();
    let mut pos = self.cursor;
    let mut chars = slice.chars().peekable();
    let mut tk = Tk::default();

    while let Some(ch) = chars.next() {
      match ch {
        '>' => {
          if chars.peek() == Some(&'(') {
            return None; // It's a process sub
          }
          pos += 1;
					if let Some('|') = chars.peek() {
						// noclobber force '>|'
						chars.next();
						pos += 1;
						tk = self.get_token(self.cursor..pos, TkRule::Redir);
						break
					}

          if let Some('>') = chars.peek() {
            chars.next();
            pos += 1;
          }
          let Some('&') = chars.peek() else {
            tk = self.get_token(self.cursor..pos, TkRule::Redir);
            break;
          };

					chars.next();
					pos += 1;

					let mut found_fd = false;
					if chars.peek().is_some_and(|ch| *ch == '-') {
						chars.next();
						found_fd = true;
						pos += 1;
					} else {
						while chars.peek().is_some_and(|ch| ch.is_ascii_digit()) {
							chars.next();
							found_fd = true;
							pos += 1;
						}
					}

					if !found_fd && !self.flags.contains(LexFlags::LEX_UNFINISHED) {
						let span_start = self.cursor;
						self.cursor = pos;
						return Some(Err(ShErr::at(
							ShErrKind::ParseErr,
							Span::new(span_start..pos, self.source.clone()),
							"Invalid redirection",
						)));
					} else {
						tk = self.get_token(self.cursor..pos, TkRule::Redir);
						break;
					}
        }
        '<' => {
          if chars.peek() == Some(&'(') {
            return None; // It's a process sub
          }
          pos += 1;

					match chars.peek() {
						Some('<') => {
							chars.next();
							pos += 1;

							match chars.peek() {
								Some('<') => {
									chars.next();
									pos += 1;
								}

								Some(ch) => {
									let mut ch = *ch;
									while is_field_sep(ch) {
										let Some(next_ch) = chars.next() else {
											// Incomplete input — fall through to emit << as Redir
											break;
										};
										pos += next_ch.len_utf8();
										ch = next_ch;
									}

									if is_field_sep(ch) {
										// Ran out of input while skipping whitespace — fall through
									} else {
										let saved_cursor = self.cursor;
										match self.read_heredoc(pos) {
											Ok(Some(heredoc_tk)) => {
												// cursor is set to after the delimiter word;
												// heredoc_skip is set to after the body
												pos = self.cursor;
												self.cursor = saved_cursor;
												tk = heredoc_tk;
												break;
											}
											Ok(None) => {
												// Incomplete heredoc — restore cursor and fall through
												self.cursor = saved_cursor;
											}
											Err(e) => return Some(Err(e)),
										}
									}
								}
								_ => {
									// No delimiter yet — input is incomplete
									// Fall through to emit the << as a Redir token
								}
							}
						}
						Some('>') => {
							chars.next();
							pos += 1;
							tk = self.get_token(self.cursor..pos, TkRule::Redir);
							break;
						}
						Some('&') => {
							chars.next();
							pos += 1;

							let mut found_fd = false;
							if chars.peek().is_some_and(|ch| *ch == '-') {
								chars.next();
								found_fd = true;
								pos += 1;
							} else {
								while chars.peek().is_some_and(|ch| ch.is_ascii_digit()) {
									chars.next();
									found_fd = true;
									pos += 1;
								}
							}

							if !found_fd && !self.flags.contains(LexFlags::LEX_UNFINISHED) {
								let span_start = self.cursor;
								self.cursor = pos;
								return Some(Err(ShErr::at(
											ShErrKind::ParseErr,
											Span::new(span_start..pos, self.source.clone()),
											"Invalid redirection",
								)));
							} else {
								tk = self.get_token(self.cursor..pos, TkRule::Redir);
								break;
							}
						}
						_ => {}
					}

          tk = self.get_token(self.cursor..pos, TkRule::Redir);
          break;
        }
        '0'..='9' => {
          pos += 1;
          while chars.peek().is_some_and(|ch| ch.is_ascii_digit()) {
            chars.next();
            pos += 1;
          }
        }
        _ => {
          return None;
        }
      }
    }

    if tk == Tk::default() {
      return None;
    }

    self.cursor = pos;
    Some(Ok(tk))
  }
	/// Reads a heredoc whose delimiter word starts at byte offset `pos`
	/// (just after `<<` and any blanks).
	///
	/// The delimiter word runs up to the next space, tab, newline or `;`.
	/// A leading `-` sets `TAB_HEREDOC`; quote characters are stripped from
	/// the delimiter and set `LIT_HEREDOC`. The body starts on the line after
	/// the current one, or, if an earlier heredoc on this line already claimed
	/// a body, right after that body. It ends at the first line equal to the
	/// delimiter (a trailing `\r` is ignored; for `<<-` leading tabs on the
	/// delimiter line are ignored too).
	///
	/// Returns
	/// * `Ok(Some(tk))`: a `Redir` token flagged `IS_HEREDOC` whose span is
	///   the body, excluding the delimiter line. `self.cursor` is set just
	///   past the delimiter word and `self.heredoc_skip` just past the
	///   delimiter line.
	/// * `Ok(None)`: the heredoc is incomplete and `LEX_UNFINISHED` is set.
	/// * `Err(_)`: the heredoc is incomplete and `LEX_UNFINISHED` is not set.
	pub fn read_heredoc(&mut self, mut pos: usize) -> ShResult<Option<Tk>> {
		let slice = self.slice(pos..).unwrap_or_default().to_string();
		let mut delim = String::new();
		let mut flags = TkFlags::empty();
		let mut first_char = true;
		// Parse the delimiter word, stripping quotes
		for ch in slice.chars() {
			match ch {
				'-' if first_char => {
					pos += 1;
					flags |= TkFlags::TAB_HEREDOC;
				}
				'\"' => {
					pos += 1;
					self.quote_state.toggle_double();
					flags |= TkFlags::LIT_HEREDOC;
				}
				'\'' => {
					pos += 1;
					self.quote_state.toggle_single();
					flags |= TkFlags::LIT_HEREDOC;
				}
				_ if self.quote_state.in_quote() => {
					pos += ch.len_utf8();
					delim.push(ch);
				}
				ch if is_hard_sep(ch) => {
					break;
				}
				ch => {
					pos += ch.len_utf8();
					delim.push(ch);
				}
			}
			first_char = false;
		}

		// pos is now right after the delimiter word — this is where
		// the cursor should return so the rest of the line gets lexed
		let cursor_after_delim = pos;

		// Find the byte offset where the body begins
		let body_start = match self.heredoc_skip {
			// A previous heredoc on this line already read its body;
			// our body starts where that one ended. A value behind the
			// delimiter cannot belong to this line and is ignored.
			Some(skip) if skip >= cursor_after_delim => skip,
			_ => {
				// The body begins after the rest of the current line
				let rest = self.slice(cursor_after_delim..).unwrap_or_default();
				match rest.find('\n') {
					Some(newline) => cursor_after_delim + newline + 1,
					None if self.flags.contains(LexFlags::LEX_UNFINISHED) => return Ok(None),
					None => {
						return Err(ShErr::at(
							ShErrKind::ParseErr,
							Span::new(pos..pos, self.source.clone()),
							"Heredoc delimiter not found",
						));
					}
				}
			}
		};

		// Slice the body by byte offset so that the text being scanned and the
		// offsets being tracked cannot drift apart on multi-byte characters
		let body = self.slice(body_start..).unwrap_or_default().to_string();
		let body_end = body_start + body.len();
		let strip_tabs = flags.contains(TkFlags::TAB_HEREDOC);

		// Read lines until we find one that matches the delimiter.
		// `split` also yields the final line when it has no trailing newline.
		let mut line_start = body_start;
		for line in body.split('\n') {
			// Start of the following line (clamped for the unterminated last line)
			let next_line = (line_start + line.len() + 1).min(body_end);
			let mut candidate = line.trim_end_matches('\r');
			if strip_tabs {
				// `<<-` allows the closing delimiter to be indented with tabs
				candidate = candidate.trim_start_matches('\t');
			}
			if candidate == delim {
				let mut tk = self.get_token(body_start..line_start, TkRule::Redir);
				tk.flags |= TkFlags::IS_HEREDOC | flags;
				self.heredoc_skip = Some(next_line);
				self.cursor = cursor_after_delim;
				return Ok(Some(tk));
			}
			line_start = next_line;
		}

		if !self.flags.contains(LexFlags::LEX_UNFINISHED) {
			Err(ShErr::at(
				ShErrKind::ParseErr,
				Span::new(body_start..body_end, self.source.clone()),
				format!("Heredoc delimiter '{}' not found", delim),
			))
		} else {
			Ok(None)
		}
	}
  pub fn read_string(&mut self) -> ShResult<Tk> {
    assert!(self.cursor <= self.source.len());
    let slice = self.slice_from_cursor().unwrap().to_string();
    let mut pos = self.cursor;
    let mut chars = slice.chars().peekable();
    let can_be_subshell = chars.peek() == Some(&'(');

    if self.case_depth > 0
      && let Some(count) = case_pat_lookahead(chars.clone())
    {
      pos += count;
      let casepat_tk = self.get_token(self.cursor..pos, TkRule::CasePattern);
      self.cursor = pos;
      self.set_next_is_cmd(true);
      return Ok(casepat_tk);
    }

    while let Some(ch) = chars.next() {
      match ch {
        _ if self.flags.contains(LexFlags::RAW) => {
          if ch.is_whitespace() {
            break;
          } else {
            pos += ch.len_utf8()
          }
        }
        '\\' => {
          pos += 1;
          if let Some(ch) = chars.next() {
            pos += ch.len_utf8();
          }
        }
        '\'' => {
          pos += 1;
          self.quote_state.toggle_single();
        }
        _ if self.quote_state.in_single() => pos += ch.len_utf8(),
        '$' if chars.peek() == Some(&'(') => {
          pos += 2;
          chars.next();
          let mut paren_count = 1;
          let paren_pos = pos;
          while let Some(ch) = chars.next() {
            match ch {
              '\\' => {
                pos += 1;
                if let Some(next_ch) = chars.next() {
                  pos += next_ch.len_utf8();
                }
              }
              '(' => {
                pos += 1;
                paren_count += 1;
              }
              ')' => {
                pos += 1;
                paren_count -= 1;
                if paren_count <= 0 {
                  break;
                }
              }
              _ => pos += ch.len_utf8(),
            }
          }
          if !paren_count == 0 && !self.flags.contains(LexFlags::LEX_UNFINISHED) {
            self.cursor = pos;
            return Err(ShErr::at(
              ShErrKind::ParseErr,
              Span::new(paren_pos..paren_pos + 1, self.source.clone()),
              "Unclosed subshell",
            ));
          }
        }
        '$' if chars.peek() == Some(&'{') => {
          pos += 2;
          chars.next();
          let mut brace_count = 1;
          while let Some(brc_ch) = chars.next() {
            match brc_ch {
              '\\' => {
                pos += 1;
                if let Some(next_ch) = chars.next() {
                  pos += next_ch.len_utf8()
                }
              }
              '{' => {
                pos += 1;
                brace_count += 1;
              }
              '}' => {
                pos += 1;
                brace_count -= 1;
                if brace_count == 0 {
                  break;
                }
              }
              _ => pos += ch.len_utf8(),
            }
          }
        }
        '"' => {
          pos += 1;
          self.quote_state.toggle_double();
        }
        _ if self.quote_state.in_double() => pos += ch.len_utf8(),
        '<' if chars.peek() == Some(&'(') => {
          pos += 2;
          chars.next();
          let mut paren_count = 1;
          let paren_pos = pos;
          while let Some(ch) = chars.next() {
            match ch {
              '\\' => {
                pos += 1;
                if let Some(next_ch) = chars.next() {
                  pos += next_ch.len_utf8();
                }
              }
              '(' => {
                pos += 1;
                paren_count += 1;
              }
              ')' => {
                pos += 1;
                paren_count -= 1;
                if paren_count <= 0 {
                  break;
                }
              }
              _ => pos += ch.len_utf8(),
            }
          }
          if !paren_count == 0 && !self.flags.contains(LexFlags::LEX_UNFINISHED) {
            self.cursor = pos;
            return Err(ShErr::at(
              ShErrKind::ParseErr,
              Span::new(paren_pos..paren_pos + 1, self.source.clone()),
              "Unclosed subshell",
            ));
          }
        }
        '>' if chars.peek() == Some(&'(') => {
          pos += 2;
          chars.next();
          let mut paren_count = 1;
          let paren_pos = pos;
          while let Some(ch) = chars.next() {
            match ch {
              '\\' => {
                pos += 1;
                if let Some(next_ch) = chars.next() {
                  pos += next_ch.len_utf8();
                }
              }
              '(' => {
                pos += 1;
                paren_count += 1;
              }
              ')' => {
                pos += 1;
                paren_count -= 1;
                if paren_count <= 0 {
                  break;
                }
              }
              _ => pos += ch.len_utf8(),
            }
          }
          if !paren_count == 0 && !self.flags.contains(LexFlags::LEX_UNFINISHED) {
            self.cursor = pos;
            return Err(ShErr::at(
              ShErrKind::ParseErr,
              Span::new(paren_pos..paren_pos + 1, self.source.clone()),
              "Unclosed subshell",
            ));
          }
        }
        '(' if self.next_is_cmd() && can_be_subshell => {
          pos += 1;
          let mut paren_count = 1;
          let paren_pos = pos;
          while let Some(ch) = chars.next() {
            match ch {
              '\\' => {
                pos += 1;
                if let Some(next_ch) = chars.next() {
                  pos += next_ch.len_utf8();
                }
              }
              '(' => {
                pos += 1;
                paren_count += 1;
              }
              ')' => {
                pos += 1;
                paren_count -= 1;
                if paren_count <= 0 {
                  break;
                }
              }
              _ => pos += ch.len_utf8(),
            }
          }
          if paren_count != 0 && !self.flags.contains(LexFlags::LEX_UNFINISHED) {
            self.cursor = pos;
            return Err(ShErr::at(
              ShErrKind::ParseErr,
              Span::new(paren_pos..paren_pos + 1, self.source.clone()),
              "Unclosed subshell",
            ));
          }
          let mut subsh_tk = self.get_token(self.cursor..pos, TkRule::Str);
          subsh_tk.flags |= TkFlags::IS_CMD;
          subsh_tk.flags |= TkFlags::IS_SUBSH;
          self.cursor = pos;
          self.set_next_is_cmd(true);
          return Ok(subsh_tk);
        }
        '{' if pos == self.cursor && self.next_is_cmd() => {
          pos += 1;
          let mut tk = self.get_token(self.cursor..pos, TkRule::BraceGrpStart);
          tk.flags |= TkFlags::IS_CMD;
          self.enter_brc_grp();
          self.set_next_is_cmd(true);

          self.cursor = pos;
          return Ok(tk);
        }
        '}' if pos == self.cursor && self.in_brc_grp() => {
          pos += 1;
          let tk = self.get_token(self.cursor..pos, TkRule::BraceGrpEnd);
          self.leave_brc_grp();
          self.set_next_is_cmd(true);
          self.cursor = pos;
          return Ok(tk);
        }
        '=' if chars.peek() == Some(&'(') => {
          pos += 1; // '='
          let mut depth = 1;
          chars.next();
          pos += 1; // '('
          // looks like an array
          while let Some(arr_ch) = chars.next() {
            match arr_ch {
              '\\' => {
                pos += 1;
                if let Some(next_ch) = chars.next() {
                  pos += next_ch.len_utf8();
                }
              }
              '(' => {
                depth += 1;
                pos += 1;
              }
              ')' => {
                depth -= 1;
                pos += 1;
                if depth == 0 {
                  break;
                }
              }
              _ => pos += arr_ch.len_utf8(),
            }
          }
        }
        _ if is_hard_sep(ch) => break,
        _ => pos += ch.len_utf8(),
      }
    }
    let mut new_tk = self.get_token(self.cursor..pos, TkRule::Str);
    if self.quote_state.in_quote() && !self.flags.contains(LexFlags::LEX_UNFINISHED) {
      self.cursor = pos;
      return Err(ShErr::at(
        ShErrKind::ParseErr,
        new_tk.span,
        "Unterminated quote",
      ));
    }

    let text = new_tk.span.as_str();
    if self.flags.contains(LexFlags::NEXT_IS_CMD) {
      match text {
        "case" | "select" | "for" => {
          new_tk.mark(TkFlags::KEYWORD);
          self.flags |= LexFlags::EXPECTING_IN;
          self.case_depth += 1;
          self.set_next_is_cmd(false);
        }
        "in" if self.flags.contains(LexFlags::EXPECTING_IN) => {
          new_tk.mark(TkFlags::KEYWORD);
          self.flags &= !LexFlags::EXPECTING_IN;
        }
        _ if is_keyword(text) => {
          if text == "esac" && self.case_depth > 0 {
            self.case_depth -= 1;
          }
          new_tk.mark(TkFlags::KEYWORD);
        }
        _ if is_assignment(text) => {
          new_tk.mark(TkFlags::ASSIGN);
        }
        _ if is_cmd_sub(text) => {
          new_tk.mark(TkFlags::IS_CMDSUB);
          if self.next_is_cmd() {
            new_tk.mark(TkFlags::IS_CMD);
          }
          self.set_next_is_cmd(false);
        }
        _ => {
          new_tk.flags |= TkFlags::IS_CMD;
          if BUILTINS.contains(&text) {
            new_tk.mark(TkFlags::BUILTIN);
          }
          self.set_next_is_cmd(false);
        }
      }
    } else if self.flags.contains(LexFlags::EXPECTING_IN) && text == "in" {
      new_tk.mark(TkFlags::KEYWORD);
      self.flags &= !LexFlags::EXPECTING_IN;
    } else if is_cmd_sub(text) {
      new_tk.mark(TkFlags::IS_CMDSUB)
    }
    self.cursor = pos;
    Ok(new_tk)
  }
  pub fn get_token(&self, range: Range<usize>, class: TkRule) -> Tk {
    let mut span = Span::new(range, self.source.clone());
    span.rename(self.name.clone());
    Tk::new(class, span)
  }
}

/// Token stream over the lexer's source.
///
/// Each call to `next` yields, in order of priority:
/// 1. at the end of input: an `EOI` token once (or an "Unclosed brace group"
///    error), then `None`;
/// 2. an `SOI` token on the very first call on non-empty input;
/// 3. in `RAW` mode: whatever `read_string` returns;
/// 4. otherwise, after skipping blanks and backslash-newline pairs: a
///    separator, comment, `!`, pipe/logical operator, redirection, or word.
impl Iterator for LexStream {
  type Item = ShResult<Tk>;
  fn next(&mut self) -> Option<Self::Item> {
    assert!(self.cursor <= self.source.len());
    // We are at the end of the input
    if self.cursor == self.source.len() {
      if self.flags.contains(LexFlags::STALE) {
        // We've already returned an EOI token, nothing left to do
        return None;
      } else {
        // Return the EOI token
        if self.in_brc_grp() && !self.flags.contains(LexFlags::LEX_UNFINISHED) {
          let start = self.brc_grp_start.unwrap_or(self.cursor.saturating_sub(1));
          self.flags |= LexFlags::STALE;
          return Err(ShErr::at(
            ShErrKind::ParseErr,
            Span::new(start..self.cursor, self.source.clone()),
            "Unclosed brace group",
          ))
          .into();
        }
        let token = self.get_token(self.cursor..self.cursor, TkRule::EOI);
        self.flags |= LexFlags::STALE;
        return Some(Ok(token));
      }
    }
    // Return the SOI token
    if self.flags.contains(LexFlags::FRESH) {
      self.flags &= !LexFlags::FRESH;
      let token = self.get_token(self.cursor..self.cursor, TkRule::SOI);
      return Some(Ok(token));
    }

    // If we are just reading raw words, short circuit here
    // Used for word splitting variable values
    if self.flags.contains(LexFlags::RAW) {
      return Some(self.read_string());
    }

    // Skip blanks (spaces and tabs) and backslash-newline pairs
    loop {
      let pos = self.cursor;
      if self.slice(pos..pos + 2) == Some("\\\n") {
        self.cursor += 2;
      } else if pos < self.source.len() && is_field_sep(get_char(&self.source, pos).unwrap()) {
        self.cursor += 1;
      } else {
        break;
      }
    }

    // Only blanks were left.
    // NOTE(review): this path returns `None` (or the error) without setting
    // STALE, so a later call reaches the end-of-input branch above and
    // yields EOI after the `None` — confirm that callers expect this.
    if self.cursor == self.source.len() {
      if self.in_brc_grp() && !self.flags.contains(LexFlags::LEX_UNFINISHED) {
        let start = self.brc_grp_start.unwrap_or(self.cursor.saturating_sub(1));
        return Err(ShErr::at(
          ShErrKind::ParseErr,
          Span::new(start..self.cursor, self.source.clone()),
          "Unclosed brace group",
        ))
        .into();
      }
      return None;
    }

    let token = match get_char(&self.source, self.cursor).unwrap() {
      // Command separator: swallows the whole run of separators that follows
      '\r' | '\n' | ';' => {
        let ch = get_char(&self.source, self.cursor).unwrap();
        let ch_idx = self.cursor;
        self.cursor += 1;
        self.set_next_is_cmd(true);

        // If a heredoc was parsed on this line, skip past the body
        // Only on newline — ';' is a command separator within the same line
        if (ch == '\n' || ch == '\r')
				&& let Some(skip) = self.heredoc_skip.take() {
					self.cursor = skip;
				}

        while let Some(ch) = get_char(&self.source, self.cursor) {
          match ch {
            '\\' if get_char(&self.source, self.cursor + 1) == Some('\n') => {
              self.cursor = (self.cursor + 2).min(self.source.len());
            }
            _ if is_hard_sep(ch) => {
              self.cursor += 1;
            }
            _ => break,
          }
        }
        self.get_token(ch_idx..self.cursor, TkRule::Sep)
      }
      // Comment: always recognized when not interactive; in interactive mode
      // only when the `interactive_comments` option is on
      '#'
        if !self.flags.contains(LexFlags::INTERACTIVE)
          || crate::state::read_shopts(|s| s.core.interactive_comments) =>
      {
        let ch_idx = self.cursor;
        self.cursor += 1;

        // Consume through the end of the line, including the newline.
        // NOTE(review): the newline is consumed here without taking
        // `heredoc_skip`, unlike the separator arm above — verify behavior
        // for a comment on the same line as a heredoc operator.
        while let Some(ch) = get_char(&self.source, self.cursor) {
          self.cursor += ch.len_utf8();
          if ch == '\n' {
            break;
          }
        }

        if self.flags.contains(LexFlags::LEX_UNFINISHED) {
          self.get_token(ch_idx..self.cursor, TkRule::Comment)
        } else {
          // After consuming the comment, we call next() recursively. This effectively filters out comment tokens.
          return self.next();
        }
      }
      // `!` only counts as negation where a command may start
      '!' if self.next_is_cmd() => {
        self.cursor += 1;
        let tk_type = TkRule::Bang;

        let mut tk = self.get_token((self.cursor - 1)..self.cursor, tk_type);
        tk.flags |= TkFlags::KEYWORD;
        tk
      }
      // `|`, `||` or `|&`
      '|' => {
        let ch_idx = self.cursor;
        self.cursor += 1;
        self.set_next_is_cmd(true);

        let tk_type = if let Some('|') = get_char(&self.source, self.cursor) {
          self.cursor += 1;
          TkRule::Or
        } else if let Some('&') = get_char(&self.source, self.cursor) {
          self.cursor += 1;
          TkRule::ErrPipe
        } else {
          TkRule::Pipe
        };

        self.get_token(ch_idx..self.cursor, tk_type)
      }
      // `&` or `&&`
      '&' => {
        let ch_idx = self.cursor;
        self.cursor += 1;
        self.set_next_is_cmd(true);

        let tk_type = if let Some('&') = get_char(&self.source, self.cursor) {
          self.cursor += 1;
          TkRule::And
        } else {
          TkRule::Bg
        };
        self.get_token(ch_idx..self.cursor, tk_type)
      }
      // Anything else: try a redirection first, then fall back to a word
      _ => {
        if let Some(tk) = self.read_redir() {
          self.set_next_is_cmd(false);
          match tk {
            Ok(tk) => tk,
            Err(e) => return Some(Err(e)),
          }
        } else {
          match self.read_string() {
            Ok(tk) => tk,
            Err(e) => {
              return Some(Err(e));
            }
          }
        }
      }
    };
    Some(Ok(token))
  }
}

/// Returns the character that starts at byte offset `idx` of `src`, or
/// `None` if `idx` is at or past the end or not on a character boundary.
pub fn get_char(src: &str, idx: usize) -> Option<char> {
  src.get(idx..).and_then(|rest| rest.chars().next())
}

/// Whether `text` contains an `=` that is not preceded by an escaping
/// backslash.
pub fn is_assignment(text: &str) -> bool {
  let mut rest = text.chars();
  loop {
    match rest.next() {
      None => return false,
      Some('=') => return true,
      // Skip whatever the backslash escapes
      Some('\\') => {
        rest.next();
      }
      Some(_) => {}
    }
  }
}

/// True for the operator characters `|`, `&`, `>` and `<`.
pub fn is_op(ch: char) -> bool {
  ['|', '&', '>', '<'].contains(&ch)
}

/// True for a space, tab, newline, or semicolon.
pub fn is_hard_sep(ch: char) -> bool {
  " \t\n;".contains(ch)
}

/// True for a space or a tab; newlines are deliberately excluded.
pub fn is_field_sep(ch: char) -> bool {
  ch == ' ' || ch == '\t'
}

/// Reports whether `slice` should be flagged as a keyword.
///
/// That is the case when it is listed in `KEYWORDS`, or when it ends in an
/// unescaped `()` that is not part of a trailing `=()`.
pub fn is_keyword(slice: &str) -> bool {
  if KEYWORDS.contains(&slice) {
    return true;
  }
  if !ends_with_unescaped(slice, "()") {
    return false;
  }
  // `name=()` is excluded from the `()`-suffix rule.
  !ends_with_unescaped(slice, "=()")
}

/// Reports whether `slice` starts with `$(` and ends with an unescaped `)`.
pub fn is_cmd_sub(slice: &str) -> bool {
  if !slice.starts_with("$(") {
    return false;
  }
  ends_with_unescaped(slice, ")")
}

/// Reports whether `slice` ends with `pat` and the start of that trailing
/// `pat` is not escaped according to `pos_is_escaped`.
pub fn ends_with_unescaped(slice: &str, pat: &str) -> bool {
  match slice.strip_suffix(pat) {
    // `head.len()` is the byte offset at which the trailing `pat` begins.
    Some(head) => !pos_is_escaped(slice, head.len()),
    None => false,
  }
}

/// Splits `slice` on every occurrence of `pat` that `split_at_unescaped` accepts,
/// i.e. occurrences that are not escaped by a backslash and not inside quotes.
///
/// The pieces are returned in order, without the separators. Whatever follows the
/// last separator is always pushed as the final element, even when it is empty,
/// so the result is never an empty vector.
///
/// Each search restarts on the remainder of the input, so quote tracking begins
/// afresh after every split.
///
/// An empty `pat` is treated as "no separator": the whole input is returned as a
/// single element. (An empty pattern matches at offset 0 without consuming
/// anything, so searching for it would never advance.)
pub fn split_all_unescaped(slice: &str, pat: &str) -> Vec<String> {
  let mut cursor = 0;
  let mut splits = vec![];
  if !pat.is_empty() {
    while let Some((before, _after)) = split_at_unescaped(&slice[cursor..], pat) {
      // Step over the piece we just found plus the separator itself.
      cursor += before.len() + pat.len();
      splits.push(before);
    }
  }
  if let Some(remaining) = slice.get(cursor..) {
    splits.push(remaining.to_string());
  }
  splits
}

/// Cuts `slice` in two around the first usable occurrence of `pat`.
///
/// An occurrence is skipped when it starts on a character consumed by a preceding
/// backslash, or when it starts on a non-quote character while a single- or
/// double-quoted section is open. On success the text before and after the
/// pattern is returned (the pattern itself is dropped); `None` means no usable
/// occurrence exists.
pub fn split_at_unescaped(slice: &str, pat: &str) -> Option<(String, String)> {
  let mut quotes = QuoteState::default();
  let mut walker = slice.char_indices();

  while let Some((idx, ch)) = walker.next() {
    if ch == '\\' {
      // The backslash and the character it escapes can never start a match.
      walker.next();
      continue;
    }

    if ch == '\'' {
      quotes.toggle_single();
    } else if ch == '"' {
      quotes.toggle_double();
    } else if quotes.in_quote() {
      continue;
    }

    if let Some(rest) = slice[idx..].strip_prefix(pat) {
      return Some((slice[..idx].to_string(), rest.to_string()));
    }
  }

  None
}

/// Splits the token `tk` on every occurrence of `pat` accepted by
/// `split_at_unescaped`, producing one token per piece.
///
/// Every resulting token reuses `tk`'s class and source; only the span differs.
/// Spans are computed by offsetting into `tk`'s own span, so they index the same
/// source text as `tk`. Separators are not covered by any resulting span.
///
/// Unlike `split_all_unescaped`, a trailing empty remainder is dropped rather
/// than emitted as an empty token.
///
/// An empty `pat` is treated as "no separator" (the token is returned whole, or
/// nothing if it is empty): an empty pattern matches at offset 0 without
/// consuming input, so searching for it would never advance.
pub fn split_tk(tk: &Tk, pat: &str) -> Vec<Tk> {
  let slice = tk.as_str();
  let mut cursor = 0;
  let mut splits = vec![];
  if !pat.is_empty() {
    while let Some((before, _after)) = split_at_unescaped(&slice[cursor..], pat) {
      let before_span = Span::new(
        tk.span.range().start + cursor..tk.span.range().start + cursor + before.len(),
        tk.source().clone(),
      );
      splits.push(Tk::new(tk.class.clone(), before_span));
      // Step over the piece we just emitted plus the separator itself.
      cursor += before.len() + pat.len();
    }
  }
  if slice.get(cursor..).is_some_and(|s| !s.is_empty()) {
    let remaining_span = Span::new(
      tk.span.range().start + cursor..tk.span.range().end,
      tk.source().clone(),
    );
    splits.push(Tk::new(tk.class.clone(), remaining_span));
  }
  splits
}

/// Token-level counterpart of `split_at_unescaped`: cuts `tk` in two around the
/// first occurrence of `pat` that is neither consumed by a backslash nor located
/// on a non-quote character inside an open quoted section.
///
/// Both halves keep `tk`'s class and source; their spans cover the text before
/// and after the pattern respectively. Returns `None` when no such occurrence
/// exists.
pub fn split_tk_at(tk: &Tk, pat: &str) -> Option<(Tk, Tk)> {
  let text = tk.as_str();
  let mut quotes = QuoteState::default();
  let mut walker = text.char_indices();

  while let Some((idx, ch)) = walker.next() {
    if ch == '\\' {
      // The backslash and the character it escapes can never start a match.
      walker.next();
      continue;
    }

    if ch == '\'' {
      quotes.toggle_single();
    } else if ch == '"' {
      quotes.toggle_double();
    } else if quotes.in_quote() {
      continue;
    }

    if !text[idx..].starts_with(pat) {
      continue;
    }

    // Translate token-relative offsets into offsets within the token's span.
    let base = tk.span.range().start;
    let head_span = Span::new(base..base + idx, tk.source().clone());
    let tail_span = Span::new(base + idx + pat.len()..tk.span.range().end, tk.source().clone());
    let head = Tk::new(tk.class.clone(), head_span);
    let tail = Tk::new(tk.class.clone(), tail_span);
    return Some((head, tail));
  }

  None
}

/// Reports whether byte offset `pos` in `slice` is escaped, i.e. immediately
/// preceded by an odd number of consecutive backslashes.
///
/// An even run (including zero) means the backslashes escape each other and
/// `pos` itself is not escaped. `pos` may equal `slice.len()`. A `pos` beyond
/// the end of `slice` is reported as not escaped instead of panicking.
pub fn pos_is_escaped(slice: &str, pos: usize) -> bool {
  // Checked lookup: an out-of-range `pos` yields `None` rather than an
  // index-out-of-bounds panic.
  let Some(prefix) = slice.as_bytes().get(..pos) else {
    return false;
  };
  // Scanning bytes is safe here: b'\\' never occurs inside a multi-byte
  // UTF-8 sequence.
  let backslashes = prefix
    .iter()
    .rev()
    .take_while(|&&byte| byte == b'\\')
    .count();
  backslashes % 2 == 1
}

/// Scans `chars` for the first occurrence of `pat` using a sliding window.
///
/// Returns the zero-based index, counted in characters (not bytes), of the
/// character that completes the first match, or `None` if the iterator is
/// exhausted first.
///
/// The window is capped at the number of *characters* in `pat`. Capping at
/// `pat.len()` (a byte count) would let the window grow longer than the pattern
/// whenever `pat` contains multi-byte characters, leaving stale characters at
/// its front.
// NOTE(review): assumes `CharDequeUtils::starts_with` is a prefix comparison of
// the deque's characters against `pat` — confirm against the trait.
pub fn lookahead(pat: &str, chars: Chars) -> Option<usize> {
  let pat_chars = pat.chars().count();
  let mut char_deque = VecDeque::new();
  for (pos, ch) in chars.enumerate() {
    char_deque.push_back(ch);
    if char_deque.len() > pat_chars {
      // Keep only the most recent `pat_chars` characters.
      char_deque.pop_front();
    }
    if char_deque.starts_with(pat) {
      return Some(pos);
    }
  }
  None
}

/// Scans ahead for an unquoted `)` and returns the number of bytes consumed up
/// to and including it.
///
/// The scan gives up with `None` when, outside of quotes, it meets a hard
/// separator (see `is_hard_sep`) or a `(`, or when the input ends first.
/// A backslash skips the character after it, `'`/`"` toggle quote tracking, and
/// an unquoted `$'...'` section is skipped as a unit (honouring backslash
/// escapes inside it).
// NOTE(review): judging by the name this is used to recognise `case` patterns —
// confirm against the caller.
pub fn case_pat_lookahead(mut chars: Peekable<Chars>) -> Option<usize> {
  let mut offset = 0;
  let mut quotes = QuoteState::default();

  while let Some(ch) = chars.next() {
    offset += ch.len_utf8();
    // Quote state as it was before this character is processed.
    let unquoted = quotes.outside();

    if unquoted && is_hard_sep(ch) {
      return None;
    }

    match ch {
      '\\' => {
        // Count the escaped character too, if there is one.
        offset += chars.next().map_or(0, char::len_utf8);
      }
      '$' if unquoted && chars.peek() == Some(&'\'') => {
        // $'...' ANSI-C quoting: swallow the opening quote, then everything
        // through the closing quote.
        chars.next();
        offset += 1;
        while let Some(inner) = chars.next() {
          offset += inner.len_utf8();
          match inner {
            '\'' => break,
            '\\' => offset += chars.next().map_or(0, char::len_utf8),
            _ => {}
          }
        }
      }
      '\'' => quotes.toggle_single(),
      '"' => quotes.toggle_double(),
      ')' if unquoted => return Some(offset),
      '(' if unquoted => return None,
      _ => {}
    }
  }

  None
}