more highlighter tune-ups

This commit is contained in:
2026-02-18 16:29:40 -05:00
parent af70266f6a
commit 01684cf8e5
2 changed files with 176 additions and 14 deletions

View File

@@ -2,6 +2,12 @@ use std::{env, path::{Path, PathBuf}};
use crate::{libsh::term::{Style, StyleSet, Styled}, prompt::readline::{annotate_input, markers}, state::read_logic}; use crate::{libsh::term::{Style, StyleSet, Styled}, prompt::readline::{annotate_input, markers}, state::read_logic};
/// Syntax highlighter for shell input using Unicode marker-based annotation
///
/// The highlighter processes annotated input strings containing invisible Unicode markers
/// (U+FDD0-U+FDEF range) that indicate syntax elements. It generates ANSI escape codes
/// for terminal display while maintaining a style stack for proper color restoration
/// in nested constructs (e.g., variables inside strings inside command substitutions).
pub struct Highlighter { pub struct Highlighter {
input: String, input: String,
output: String, output: String,
@@ -10,6 +16,7 @@ pub struct Highlighter {
} }
impl Highlighter { impl Highlighter {
/// Creates a new highlighter with empty buffers and reset state
pub fn new() -> Self { pub fn new() -> Self {
Self { Self {
input: String::new(), input: String::new(),
@@ -19,11 +26,20 @@ impl Highlighter {
} }
} }
/// Loads raw input text and annotates it with syntax markers
///
/// The input is passed through the annotator which inserts Unicode markers
/// indicating token types and sub-token constructs (strings, variables, etc.)
pub fn load_input(&mut self, input: &str) { pub fn load_input(&mut self, input: &str) {
let input = annotate_input(input); let input = annotate_input(input);
self.input = input; self.input = input;
} }
/// Processes the annotated input and generates ANSI-styled output
///
/// Walks through the input character by character, interpreting markers and
/// applying appropriate styles. Nested constructs (command substitutions,
/// subshells, strings) are handled recursively with proper style restoration.
pub fn highlight(&mut self) { pub fn highlight(&mut self) {
let input = self.input.clone(); let input = self.input.clone();
let mut input_chars = input.chars().peekable(); let mut input_chars = input.chars().peekable();
@@ -156,6 +172,10 @@ impl Highlighter {
if *ch == markers::VAR_SUB_END { if *ch == markers::VAR_SUB_END {
input_chars.next(); // consume the end marker input_chars.next(); // consume the end marker
break; break;
} else if markers::is_marker(*ch) {
log::warn!("Unhandled marker character in variable substitution: U+{:04X}", *ch as u32);
input_chars.next(); // skip the marker
continue;
} }
var_sub.push(*ch); var_sub.push(*ch);
input_chars.next(); input_chars.next();
@@ -166,13 +186,21 @@ impl Highlighter {
self.pop_style(); self.pop_style();
} }
_ => { _ => {
self.output.push(ch); if markers::is_marker(ch) {
self.last_was_reset = false; log::warn!("Unhandled marker character in highlighter: U+{:04X}", ch as u32);
} else {
self.output.push(ch);
self.last_was_reset = false;
}
} }
} }
} }
} }
/// Extracts the highlighted output and resets the highlighter state
///
/// Clears the input buffer, style stack, and returns the generated output
/// containing ANSI escape codes. The highlighter is ready for reuse after this.
pub fn take(&mut self) -> String { pub fn take(&mut self) -> String {
log::info!("Highlighting result: {:?}", self.output); log::info!("Highlighting result: {:?}", self.output);
self.input.clear(); self.input.clear();
@@ -180,6 +208,12 @@ impl Highlighter {
std::mem::take(&mut self.output) std::mem::take(&mut self.output)
} }
/// Checks if a command name is valid (exists in PATH, is a function, or is an alias)
///
/// Searches:
/// 1. Current directory if command is a path
/// 2. All directories in PATH environment variable
/// 3. Shell functions and aliases in the current shell state
fn is_valid(command: &str) -> bool { fn is_valid(command: &str) -> bool {
let path = env::var("PATH").unwrap_or_default(); let path = env::var("PATH").unwrap_or_default();
let paths = path.split(':'); let paths = path.split(':');
@@ -202,6 +236,10 @@ impl Highlighter {
false false
} }
/// Emits a reset ANSI code to the output, with deduplication
///
/// Only emits the reset if the last emitted code was not already a reset,
/// preventing redundant `\x1b[0m` sequences in the output.
fn emit_reset(&mut self) { fn emit_reset(&mut self) {
if !self.last_was_reset { if !self.last_was_reset {
self.output.push_str(&Style::Reset.to_string()); self.output.push_str(&Style::Reset.to_string());
@@ -209,17 +247,31 @@ impl Highlighter {
} }
} }
/// Emits a style ANSI code to the output
///
/// Unconditionally appends the ANSI escape sequence for the given style
/// and marks that we're no longer in a reset state.
fn emit_style(&mut self, style: &StyleSet) { fn emit_style(&mut self, style: &StyleSet) {
self.output.push_str(&style.to_string()); self.output.push_str(&style.to_string());
self.last_was_reset = false; self.last_was_reset = false;
} }
/// Pushes a new style onto the stack and emits its ANSI code
///
/// Used when entering a new syntax context (string, variable, command, etc.).
/// The style stack allows proper restoration when exiting nested constructs.
pub fn push_style(&mut self, style: impl Into<StyleSet>) { pub fn push_style(&mut self, style: impl Into<StyleSet>) {
let set: StyleSet = style.into(); let set: StyleSet = style.into();
self.style_stack.push(set.clone()); self.style_stack.push(set.clone());
self.emit_style(&set); self.emit_style(&set);
} }
/// Pops a style from the stack and restores the previous style
///
/// Used when exiting a syntax context. If there's a parent style on the stack,
/// it's re-emitted to restore the previous color. Otherwise, emits a reset.
/// This ensures colors are properly restored in nested constructs like
/// `"string with $VAR"` where the string color resumes after the variable.
pub fn pop_style(&mut self) { pub fn pop_style(&mut self) {
self.style_stack.pop(); self.style_stack.pop();
if let Some(style) = self.style_stack.last().cloned() { if let Some(style) = self.style_stack.last().cloned() {
@@ -229,11 +281,20 @@ impl Highlighter {
} }
} }
/// Clears all styles from the stack and emits a reset
///
/// Used at command separators and explicit reset markers to return to
/// the default terminal color between independent commands.
pub fn clear_styles(&mut self) { pub fn clear_styles(&mut self) {
self.style_stack.clear(); self.style_stack.clear();
self.emit_reset(); self.emit_reset();
} }
/// Simple marker-to-ANSI replacement (unused in favor of stack-based highlighting)
///
/// Performs direct string replacement of markers with ANSI codes, without
/// handling nesting or proper color restoration. Kept for reference but not
/// used in the current implementation.
pub fn trivial_replace(&mut self) { pub fn trivial_replace(&mut self) {
self.input = self.input self.input = self.input
.replace([markers::RESET, markers::ARG], "\x1b[0m") .replace([markers::RESET, markers::ARG], "\x1b[0m")

View File

@@ -62,6 +62,30 @@ pub mod markers {
SUBSH_END, SUBSH_END,
RESET RESET
]; ];
/// Markers that classify a token as a whole (subshell, command, builtin,
/// argument, keyword, operator, redirect, separator, case pattern, assignment).
///
/// One of the three marker tables consulted by `is_marker`.
pub const TOKEN_LEVEL: [char; 10] = [
    SUBSH, COMMAND, BUILTIN, ARG, KEYWORD, OPERATOR, REDIRECT, CMD_SEP, CASE_PAT, ASSIGNMENT,
];
/// Markers for constructs nested inside a token: variable, command and process
/// substitutions, double- and single-quoted strings, and globs.
///
/// The matching `*_END` markers are not listed here. One of the three marker
/// tables consulted by `is_marker`.
pub const SUB_TOKEN: [char; 6] = [VAR_SUB, CMD_SUB, PROC_SUB, STRING_DQ, STRING_SQ, GLOB];
/// Reports whether `c` is one of the annotation markers known to this module.
///
/// A character counts as a marker when it appears in `TOKEN_LEVEL`,
/// `SUB_TOKEN`, or `END_MARKERS`; the tables are checked in that order and the
/// search stops at the first hit.
pub fn is_marker(c: char) -> bool {
    [&TOKEN_LEVEL[..], &SUB_TOKEN[..], &END_MARKERS[..]]
        .iter()
        .any(|table| table.contains(&c))
}
} }
/// Non-blocking readline result /// Non-blocking readline result
@@ -219,7 +243,7 @@ impl FernVi {
log::debug!("{line:?}"); log::debug!("{line:?}");
let to_cursor = self.editor.slice_to_cursor().unwrap_or_default(); let to_cursor = self.editor.slice_to_cursor().unwrap_or_default();
let (cols, _) = get_win_size(STDIN_FILENO); let (cols, _) = get_win_size(STDIN_FILENO);
Layout::from_parts(/* tab_stop: */ 8, cols, &self.prompt, to_cursor, &line) Layout::from_parts(/* tab_stop: */ 8, cols, &self.prompt, to_cursor, line)
} }
pub fn scroll_history(&mut self, cmd: ViCmd) { pub fn scroll_history(&mut self, cmd: ViCmd) {
log::debug!("scrolling"); log::debug!("scrolling");
@@ -305,12 +329,16 @@ impl FernVi {
} }
pub fn line_text(&mut self) -> String { pub fn line_text(&mut self) -> String {
let start = Instant::now();
let line = self.editor.to_string(); let line = self.editor.to_string();
self.highlighter.load_input(&line); self.highlighter.load_input(&line);
self.highlighter.highlight(); self.highlighter.highlight();
let highlighted = self.highlighter.take(); let highlighted = self.highlighter.take();
let hint = self.editor.get_hint_text(); let hint = self.editor.get_hint_text();
format!("{highlighted}{hint}") let complete = format!("{highlighted}{hint}");
let end = start.elapsed();
log::info!("Line styling done in: {:.2?}", end);
complete
} }
pub fn print_line(&mut self) -> ShResult<()> { pub fn print_line(&mut self) -> ShResult<()> {
@@ -486,8 +514,25 @@ impl FernVi {
} }
} }
/// Annotate a given input with helpful markers that give quick contextual syntax information /// Annotates shell input with invisible Unicode markers for syntax highlighting
/// Useful for syntax highlighting and completion ///
/// Takes raw shell input and inserts non-character markers (U+FDD0-U+FDEF range)
/// around syntax elements. These markers indicate:
/// - Token-level context (commands, arguments, operators, keywords)
/// - Sub-token constructs (strings, variables, command substitutions, globs)
///
/// The annotated string is suitable for processing by the highlighter, which
/// interprets the markers and generates ANSI escape codes.
///
/// # Strategy
/// Tokens are processed in reverse order so that later insertions don't
/// invalidate earlier positions. Each token is annotated independently.
///
/// # Example
/// ```text
/// "echo $USER" -> "COMMAND echo RESET ARG VAR_SUB $USER VAR_SUB_END RESET"
/// ```
/// (where COMMAND, RESET, etc. are invisible Unicode markers)
pub fn annotate_input(input: &str) -> String { pub fn annotate_input(input: &str) -> String {
let mut annotated = input.to_string(); let mut annotated = input.to_string();
let input = Arc::new(input.to_string()); let input = Arc::new(input.to_string());
@@ -502,6 +547,16 @@ pub fn annotate_input(input: &str) -> String {
annotated annotated
} }
/// Maps token class to its corresponding marker character
///
/// Returns the appropriate Unicode marker for token-level syntax elements.
/// Token-level markers are derived directly from the lexer's token classification
/// and represent complete tokens (operators, separators, etc.).
///
/// Returns `None` for:
/// - String tokens (which need sub-token scanning for variables, quotes, etc.)
/// - Structural markers (SOI, EOI, Null)
/// - Unimplemented features (comments, brace groups)
pub fn marker_for(class: &TkRule) -> Option<char> { pub fn marker_for(class: &TkRule) -> Option<char> {
match class { match class {
TkRule::Pipe | TkRule::Pipe |
@@ -523,7 +578,48 @@ pub fn marker_for(class: &TkRule) -> Option<char> {
} }
} }
/// Annotates a single token with markers for both token-level and sub-token constructs
///
/// This is the core annotation function that handles the complexity of shell syntax.
/// It uses a two-phase approach:
///
/// # Phase 1: Analysis (Delayed Insertion)
/// Scans through the token character by character, recording marker insertions
/// as `(position, marker)` pairs in a list. This avoids borrowing issues and
/// allows context queries during the scan.
///
/// The analysis phase handles:
/// - **Strings**: Single/double quoted regions (with escaping rules)
/// - **Variables**: `$VAR` and `${VAR}` expansions
/// - **Command substitutions**: `$(...)` with depth tracking
/// - **Process substitutions**: `<(...)` and `>(...)`
/// - **Globs**: `*`, `?`, `[...]` patterns (context-aware)
/// - **Escapes**: Backslash escaping
///
/// # Phase 2: Application (Sorted Insertion)
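/// Markers are sorted by position (descending) to avoid index invalidation when
/// inserting into the string. Markers that share a position are ordered by
/// priority; lower priorities are inserted first and therefore end up further right:
/// 1. RESET (rightmost)
/// 2. Markers with no special priority (e.g. COMMAND, KEYWORD)
/// 3. Sub-token markers, opening and closing alike (VAR_SUB, CMD_SUB, PROC_SUB,
///    STRING_DQ, STRING_SQ and their END counterparts), plus SUBSH_END
/// 4. ARG (leftmost)
///
/// This produces the pattern: `[ARG][SUB-TOKEN][TOGGLE][RESET]` at boundaries.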
///
/// # Context Tracking
/// The `in_context` closure queries the insertion list to determine the active
/// syntax context at the current position. This enables context-aware decisions
/// like "only highlight globs in arguments, not in command names".
///
/// # Depth Tracking
/// Nested constructs like `$(echo $(date))` are tracked with depth counters.
/// Only the outermost construct is marked; inner content is handled recursively
/// by the highlighter.
pub fn annotate_token(input: &mut String, token: Tk) { pub fn annotate_token(input: &mut String, token: Tk) {
// Sort by position descending, with priority ordering at same position:
// - RESET first (inserted first, ends up rightmost)
// - Regular markers next
// - Sub-token start/END markers and SUBSH_END after that
// - ARG last (inserted last, ends up leftmost)
// Result: [ARG][SUB-TOKEN][TOGGLE][RESET]
let sort_insertions = |insertions: &mut Vec<(usize, char)>| { let sort_insertions = |insertions: &mut Vec<(usize, char)>| {
insertions.sort_by(|a, b| { insertions.sort_by(|a, b| {
match b.0.cmp(&a.0) { match b.0.cmp(&a.0) {
@@ -531,12 +627,18 @@ pub fn annotate_token(input: &mut String, token: Tk) {
let priority = |m: char| -> u8 { let priority = |m: char| -> u8 {
match m { match m {
markers::RESET => 0, markers::RESET => 0,
markers::VAR_SUB |
markers::VAR_SUB_END | markers::VAR_SUB_END |
markers::CMD_SUB |
markers::CMD_SUB_END | markers::CMD_SUB_END |
markers::PROC_SUB |
markers::PROC_SUB_END | markers::PROC_SUB_END |
markers::STRING_DQ |
markers::STRING_DQ_END | markers::STRING_DQ_END |
markers::STRING_SQ |
markers::STRING_SQ_END | markers::STRING_SQ_END |
markers::SUBSH_END => 2, markers::SUBSH_END => 2,
markers::ARG => 3,
_ => 1, _ => 1,
} }
}; };
@@ -555,12 +657,18 @@ pub fn annotate_token(input: &mut String, token: Tk) {
let priority = |m: char| -> u8 { let priority = |m: char| -> u8 {
match m { match m {
markers::RESET => 0, markers::RESET => 0,
markers::VAR_SUB |
markers::VAR_SUB_END | markers::VAR_SUB_END |
markers::CMD_SUB |
markers::CMD_SUB_END | markers::CMD_SUB_END |
markers::PROC_SUB |
markers::PROC_SUB_END | markers::PROC_SUB_END |
markers::STRING_DQ |
markers::STRING_DQ_END | markers::STRING_DQ_END |
markers::STRING_SQ |
markers::STRING_SQ_END | markers::STRING_SQ_END |
markers::SUBSH_END => 2, markers::SUBSH_END => 2,
markers::ARG => 3, // Lowest priority - processed first, overridden by sub-tokens
_ => 1, _ => 1,
} }
}; };
@@ -571,12 +679,9 @@ pub fn annotate_token(input: &mut String, token: Tk) {
}); });
stack.retain(|(i, m)| *i <= token.span.start && !markers::END_MARKERS.contains(m)); stack.retain(|(i, m)| *i <= token.span.start && !markers::END_MARKERS.contains(m));
log::error!("Checking context for token '{}', looking for '{}'", token.span.as_str(), c);
let Some(ctx) = stack.last() else { let Some(ctx) = stack.last() else {
return false; return false;
}; };
log::error!("Context stack for token '{}': {:?}", token.span.as_str(), stack);
log::error!("Found context marker '{}' at position {}", ctx.1, ctx.0);
ctx.1 == c ctx.1 == c
}; };
@@ -788,14 +893,10 @@ pub fn annotate_token(input: &mut String, token: Tk) {
} }
} }
// Sort by position descending, with priority ordering at same position:
// - RESET first (inserted first, ends up rightmost)
// - Regular markers middle
// - END markers last (inserted last, ends up leftmost)
// Result: [END][TOGGLE][RESET]
sort_insertions(&mut insertions); sort_insertions(&mut insertions);
for (pos, marker) in insertions { for (pos, marker) in insertions {
log::info!("Inserting marker {marker:?} at position {pos}");
let pos = pos.max(0).min(input.len()); let pos = pos.max(0).min(input.len());
input.insert(pos, marker); input.insert(pos, marker);
} }