spade_parser/lexer.rs

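//! Lexer for Spade source code, built on [`logos`]. `TokenKind::lexer(source)` yields one
//! `Result`-wrapped token per lexeme, with whitespace and `//` line comments skipped.
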
use logos::Logos;

use num::BigUint;

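/// Size information carried by an integer literal: `Unsized` for a bare literal, or the
/// explicit width parsed from a `u<N>`/`i<N>` suffix.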
#[derive(Debug, PartialEq, Clone)]
pub enum LiteralKind {
    Unsized,
    Signed(BigUint),
    Unsigned(BigUint),
}

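/// Parses the digits of an integer literal (any radix prefix already stripped by the caller)
/// together with an optional `u<N>`/`i<N>` size suffix. Underscore separators are ignored,
/// and the suffix width is always read as a decimal number.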
fn parse_int(slice: &str, radix: u32) -> (BigUint, LiteralKind) {
    let lower = slice.to_ascii_lowercase().replace(['_'], "");

    let (cleaned, kind) = if lower.contains("u") {
        let split = lower.split("u").collect::<Vec<_>>();
        let kind = LiteralKind::Unsigned(BigUint::parse_bytes(split[1].as_bytes(), 10).unwrap());
        (split[0], kind)
    } else if lower.contains("i") {
        let split = lower.split("i").collect::<Vec<_>>();
        let kind = LiteralKind::Signed(BigUint::parse_bytes(split[1].as_bytes(), 10).unwrap());
        (split[0], kind)
    } else {
        (lower.as_str(), LiteralKind::Unsized)
    };

    (
        BigUint::parse_bytes(cleaned.as_bytes(), radix).unwrap(),
        kind,
    )
}

#[derive(Logos, Debug, PartialEq, Clone)]
pub enum TokenKind {
    // Unholy regex for unicode identifiers. Stolen from Repnop who stole it from Evrey
    #[regex(r#"(?x:
        [\p{XID_Start}_]
        \p{XID_Continue}*
        (\u{3F} | \u{21} | (\u{3F}\u{21}) | \u{2048})? # ? ! ?! ⁈
    )"#, |lex| lex.slice().to_string())]
    Identifier(String),

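    // Integer literals in three radices. Underscores may be used as digit separators, and an
    // optional `u<N>`/`i<N>` suffix marks the literal as explicitly sized.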
    #[regex(r"[0-9][0-9_]*([uUiI][0-9]+)?", |lex| {
        parse_int(lex.slice(), 10)
    })]
    Integer((BigUint, LiteralKind)),
    #[regex(r"0x[0-9A-Fa-f][0-9_A-Fa-f]*([uUiI][0-9]+)?", |lex| {
        parse_int(&lex.slice()[2..], 16)
    })]
    HexInteger((BigUint, LiteralKind)),
    #[regex(r"0b[0-1][0-1_]*([uUiI][0-9]+)?", |lex| {
        parse_int(&lex.slice()[2..], 2)
    })]
    BinInteger((BigUint, LiteralKind)),

    #[token("true")]
    True,
    #[token("false")]
    False,

    #[token("LOW")]
    Low,
    #[token("HIGH")]
    High,
    #[token("HIGHIMP")]
    HighImp,

    // Keywords
    #[token("reg")]
    Reg,
    #[token("let")]
    Let,
    #[token("decl")]
    Decl,
    #[token("inst")]
    Instance,
    #[token("reset")]
    Reset,
    #[token("initial")]
    Initial,
    #[token("if")]
    If,
    #[token("else")]
    Else,
    #[token("match")]
    Match,
    #[token("set")]
    Set,

    #[token("pipeline")]
    Pipeline,
    #[token("stage")]
    Stage,
    #[token("entity")]
    Entity,
    #[token("trait")]
    Trait,
    #[token("impl")]
    Impl,
    #[token("for")]
    For,
    #[token("fn")]
    Function,
    #[token("enum")]
    Enum,
    #[token("struct")]
    Struct,
    #[token("port")]
    Port,
    #[token("mod")]
    Mod,
    #[token("use")]
    Use,
    #[token("as")]
    As,
    #[token("assert")]
    Assert,
    #[token("mut")]
    Mut,
    #[token("inv")]
    Inv,
    #[token("where")]
    Where,

    #[token("gen")]
    Gen,

    #[token("extern")]
    Extern,

    // Math operators
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("*")]
    Asterisk,
    #[token("/")]
    Slash,
    #[token("%")]
    Percentage,
    #[token("==")]
    Equals,
    #[token("!=")]
    NotEquals,
    #[token("<")]
    Lt,
    #[token(">")]
    Gt,
    #[token("<=")]
    Le,
    #[token(">=")]
    Ge,
    #[token(">>>")]
    ArithmeticRightShift,
    #[token(">>")]
    RightShift,
    #[token("<<")]
    LeftShift,
    #[token("||")]
    LogicalOr,
    #[token("&&")]
    LogicalAnd,
    #[token("^^")]
    LogicalXor,
    #[token("&")]
    Ampersand,
    #[token("|")]
    BitwiseOr,
    #[token("!")]
    Not,
    #[token("^")]
    BitwiseXor,
    #[token("~")]
    Tilde,
    #[token("`")]
    InfixOperatorSeparator,
    #[token("'")]
    SingleQuote,

    // Other operators
    #[token("=")]
    Assignment,

    #[token("(")]
    OpenParen,
    #[token(")")]
    CloseParen,

    #[token("{")]
    OpenBrace,
    #[token("}")]
    CloseBrace,

    #[token("[")]
    OpenBracket,
    #[token("]")]
    CloseBracket,

    #[token("=>")]
    FatArrow,
    #[token("->")]
    SlimArrow,
    #[token(",")]
    Comma,
    #[token(".")]
    Dot,
    #[token("..")]
    DotDot,
    #[token(";")]
    Semi,
    #[token("\u{37e}")] // U+037E GREEK QUESTION MARK, a lookalike of `;`
    GreekQuestionMark,
    #[token(":")]
    Colon,
    #[token("::")]
    PathSeparator,
    #[token("#")]
    Hash,
    #[token("$")]
    Dollar,

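    // Documentation comments. The three leading characters (`///` or `//!`) are sliced off so
    // only the comment text is stored in the token.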
    #[regex("///[^\n]*", |lex| lex.slice()[3..].to_string())]
    OutsideDocumentation(String),
    #[regex("//![^\n]*", |lex| lex.slice()[3..].to_string())]
    InsideDocumentation(String),

    /// Ignoring whitespace
    #[regex("[ \t\n\r]", logos::skip)]
    Whitespace,

    #[regex("//[^\n]*", logos::skip)]
    Comment,

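    // Block comment delimiters are emitted as ordinary tokens rather than skipped here.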
    #[token("/*")]
    BlockCommentStart,
    #[token("*/")]
    BlockCommentEnd,

    Eof,
}

impl TokenKind {
    pub fn as_str(&self) -> &'static str {
        match self {
            TokenKind::Identifier(_) => "identifier",
            TokenKind::Integer(_) => "integer",
            TokenKind::HexInteger(_) => "hexadecimal integer",
            TokenKind::BinInteger(_) => "binary integer",
            TokenKind::True => "true",
            TokenKind::False => "false",
            TokenKind::Low => "LOW",
            TokenKind::High => "HIGH",
            TokenKind::HighImp => "HIGHIMP",

            TokenKind::Let => "let",
            TokenKind::Reg => "reg",
            TokenKind::Decl => "decl",
            TokenKind::Entity => "entity",
            TokenKind::Pipeline => "pipeline",
            TokenKind::Stage => "stage",
            TokenKind::Instance => "inst",
            TokenKind::Reset => "reset",
            TokenKind::Initial => "initial",
            TokenKind::If => "if",
            TokenKind::Else => "else",
            TokenKind::Match => "match",
            TokenKind::Impl => "impl",
            TokenKind::Trait => "trait",
            TokenKind::For => "for",
            TokenKind::Function => "fn",
            TokenKind::Enum => "enum",
            TokenKind::Struct => "struct",
            TokenKind::Port => "port",
            TokenKind::Mod => "mod",
            TokenKind::As => "as",
            TokenKind::Use => "use",
            TokenKind::Assert => "assert",
            TokenKind::Set => "set",
            TokenKind::Mut => "mut",
            TokenKind::Inv => "inv",
            TokenKind::Where => "where",

            TokenKind::Gen => "gen",

            TokenKind::Extern => "extern",

            TokenKind::Assignment => "=",
            TokenKind::Plus => "+",
            TokenKind::Minus => "-",
            TokenKind::Asterisk => "*",
            TokenKind::Slash => "/",
            TokenKind::Percentage => "%",
            TokenKind::Equals => "==",
            TokenKind::NotEquals => "!=",
            TokenKind::Lt => "<",
            TokenKind::Gt => ">",
            TokenKind::Le => "<=",
            TokenKind::Ge => ">=",
            TokenKind::LeftShift => "<<",
            TokenKind::RightShift => ">>",
            TokenKind::ArithmeticRightShift => ">>>",
            TokenKind::LogicalOr => "||",
            TokenKind::LogicalAnd => "&&",
            TokenKind::LogicalXor => "^^",
            TokenKind::Ampersand => "&",
            TokenKind::BitwiseOr => "|",
            TokenKind::Not => "!",
            TokenKind::Tilde => "~",
            TokenKind::BitwiseXor => "^",
            TokenKind::InfixOperatorSeparator => "`",

            TokenKind::OpenParen => "(",
            TokenKind::CloseParen => ")",
            TokenKind::OpenBrace => "{",
            TokenKind::CloseBrace => "}",
            TokenKind::OpenBracket => "[",
            TokenKind::CloseBracket => "]",

            TokenKind::FatArrow => "=>",
            TokenKind::SlimArrow => "->",
            TokenKind::Semi => ";",
            TokenKind::GreekQuestionMark => "GreekQuestionMark(;)",
            TokenKind::Colon => ":",
            TokenKind::Comma => ",",
            TokenKind::Dot => ".",
            TokenKind::DotDot => "..",
            TokenKind::PathSeparator => "::",
            TokenKind::SingleQuote => "'",

            TokenKind::Hash => "#",
            TokenKind::Dollar => "$",

            TokenKind::Eof => "end of file",

            TokenKind::OutsideDocumentation(_) => "///",
            TokenKind::InsideDocumentation(_) => "//!",

            TokenKind::Whitespace => "whitespace",
            TokenKind::Comment => "comment",

            TokenKind::BlockCommentStart => "/*",
            TokenKind::BlockCommentEnd => "*/",
        }
    }

    pub fn is_identifier(&self) -> bool {
        matches!(self, TokenKind::Identifier(_))
    }
    pub fn is_integer(&self) -> bool {
        matches!(
            self,
            TokenKind::Integer(_) | TokenKind::HexInteger(_) | TokenKind::BinInteger(_)
        )
    }

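    /// Returns the numeric value of any integer token (decimal, hex, or binary), ignoring its
    /// size suffix; `None` for non-integer tokens.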
    pub fn as_biguint(&self) -> Option<BigUint> {
        match self {
            TokenKind::Integer((i, _))
            | TokenKind::HexInteger((i, _))
            | TokenKind::BinInteger((i, _)) => Some(i.clone()),
            _ => None,
        }
    }
}

#[cfg(test)]
mod tests {
    use spade_common::num_ext::InfallibleToBigUint;

    use super::*;

    #[test]
    fn identifiers_work() {
        let mut lex = TokenKind::lexer("abc123_");

        assert_eq!(
            lex.next(),
            Some(Ok(TokenKind::Identifier("abc123_".to_string())))
        );
    }

    #[test]
    fn integer_literals_work() {
        let mut lex = TokenKind::lexer("123");

        assert_eq!(
            lex.next(),
            Some(Ok(TokenKind::Integer((
                123_u32.to_biguint(),
                LiteralKind::Unsized
            ))))
        );
        assert_eq!(lex.next(), None);
    }

    #[test]
    fn sized_uint_integer_literals_work() {
        let mut lex = TokenKind::lexer("123u3");

        assert_eq!(
            lex.next(),
            Some(Ok(TokenKind::Integer((
                123_u32.to_biguint(),
                LiteralKind::Unsigned(3u32.to_biguint())
            ))))
        );
        assert_eq!(lex.next(), None);
    }

    #[test]
    fn sized_int_integer_literals_work() {
        let mut lex = TokenKind::lexer("123i3");

        assert_eq!(
            lex.next(),
            Some(Ok(TokenKind::Integer((
                123_u32.to_biguint(),
                LiteralKind::Signed(3u32.to_biguint())
            ))))
        );
        assert_eq!(lex.next(), None);
    }

    #[test]
    fn hex_array() {
        let mut lex = TokenKind::lexer("[0x45]");
        assert_eq!(lex.next(), Some(Ok(TokenKind::OpenBracket)));
        assert_eq!(
            lex.next(),
            Some(Ok(TokenKind::HexInteger((
                0x45_u32.to_biguint(),
                LiteralKind::Unsized
            ))))
        );
        assert_eq!(lex.next(), Some(Ok(TokenKind::CloseBracket)));
        assert_eq!(lex.next(), None);
    }

    #[test]
    fn invalid_hex_is_not_hex() {
        let mut lex = TokenKind::lexer("0xg");
        assert_eq!(
            lex.next(),
            Some(Ok(TokenKind::Integer((
                0_u32.to_biguint(),
                LiteralKind::Unsized
            ))))
        );
        assert_eq!(
            lex.next(),
            Some(Ok(TokenKind::Identifier("xg".to_string())))
        );
        assert_eq!(lex.next(), None);
    }

    #[test]
    fn doc_comments_slice_correctly() {
        let mut lex = TokenKind::lexer("//! Hello\n///G'day");
        assert_eq!(
            lex.next(),
            Some(Ok(TokenKind::InsideDocumentation(" Hello".to_string())))
        );
        assert_eq!(
            lex.next(),
            Some(Ok(TokenKind::OutsideDocumentation("G'day".to_string())))
        );
        assert_eq!(lex.next(), None);
    }
}