Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
eb5dd55
Fix clippy issues for rust 1.67 (#4478)
DimitrisJim Jan 31, 2023
1d0cb29
feat: allow not set SIGINT handler
discord9 Jan 6, 2023
844ff75
feat: impl Detail option for not set SIG_INT
discord9 Jan 31, 2023
3a08ff1
use qualname in `TypeErrors` for functions (#4476)
nicku12345 Feb 1, 2023
29e322b
Match on ascii start/continuation characters before calling functions.
DimitrisJim Jan 29, 2023
516c41d
Don't call is_emoji_presentation for each invocation of consume_normal
DimitrisJim Jan 29, 2023
a3eb988
Add initial capacities, use u32s for indents/spaces.
DimitrisJim Jan 29, 2023
6ba8191
Eat for comma.
DimitrisJim Jan 30, 2023
4c37307
Hint that the unwrap should always succeed.
DimitrisJim Feb 1, 2023
2cb6634
use workspace dependencies
youknowone Feb 6, 2023
2ce349d
Bump openssl-src from 111.24.0+1.1.1s to 111.25.0+1.1.1t
dependabot[bot] Feb 8, 2023
01380bf
Move NewLineHandler inline, don't check each character twice.
DimitrisJim Feb 6, 2023
8ef74d6
Document lexer.
DimitrisJim Feb 7, 2023
0d3ff4d
Try to fix mac build
youknowone Jan 22, 2023
c682063
update libffi
youknowone Jan 22, 2023
1f9a48f
bump up openssl and libffi
youknowone Jan 22, 2023
600a3da
skip run rust tests for macOS CI
youknowone Feb 9, 2023
585d8f2
Simplify examples/call_between_rust_and_python
youknowone Feb 9, 2023
a7ebb80
Skip linking ssl on mac runner.
DimitrisJim Feb 9, 2023
4c96416
Use entire range for generators-as-arguments
charliermarsh Feb 10, 2023
c1defc1
Add test_generator_expression_argument
youknowone Feb 10, 2023
37b4e97
Refactor: Join string and string_parser.
DimitrisJim Feb 11, 2023
deca153
Document parser crate.
DimitrisJim Feb 7, 2023
595897c
fix the typos
howjmay Feb 12, 2023
e53e891
Add tests, some comments, to function.rs.
DimitrisJim Feb 11, 2023
9fc54fd
extra_tests/snippets/{builtins => builtin_eval}.py
youknowone Feb 13, 2023
a8d63ff
remove duplicated tests from tests/stdlib_math.py
youknowone Feb 13, 2023
1b6d45d
Remove completed TODO
youknowone Feb 13, 2023
6871964
wrap_index without abs
youknowone Jun 10, 2022
6731150
optimize str.(l|r)strip
youknowone Feb 13, 2023
0c7324e
Fix aarch64 compatibility for sqlite.
jonathanslenders Feb 13, 2023
c45bcfa
Fix str.join with str subclass
youknowone Feb 13, 2023
96178a1
Add co_freevars to code object
minhrongcon2000 Feb 16, 2023
9ac6e99
Fix code linting
minhrongcon2000 Feb 16, 2023
c5a629b
Add co_cellvars to code object
howjmay Feb 15, 2023
1755f04
Fix unexpected success in test_future.
DimitrisJim Feb 16, 2023
390c8da
Use nix for more things
coolreader18 Feb 9, 2023
8bac26f
Tidy up ssl a little
coolreader18 Feb 16, 2023
39ddc50
Run cargo-update
coolreader18 Feb 16, 2023
59c7536
PyObjectRef::downcast_exact returns PyRefExact
youknowone Feb 14, 2023
c97d504
Add description for PyRefExact
youknowone Feb 16, 2023
ef5ba76
Optimize Py<PyDict>::to_attributes() to reuse PyRefExact
xiaozhiyan Feb 16, 2023
b1f41c1
Optimize bytes-like (l|r)strip (#4500)
dannasman Feb 17, 2023
900f49a
narrow publicity of BytesInner::elements
youknowone Feb 17, 2023
ad0b15e
remove unnecessary to_vec()
dannasman Feb 17, 2023
5b17bd6
Handle panic in case of unsupported format in chrono
itsankitkp Jan 27, 2023
d4a6d39
feat! add test case for unsupported time format
itsankitkp Jan 31, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Document parser crate.
  • Loading branch information
DimitrisJim authored and itsankitkp committed Feb 19, 2023
commit deca1538b565a34d060c3c351c2a2087c84184f3
2 changes: 1 addition & 1 deletion compiler/parser/src/context.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use rustpython_ast::{Expr, ExprContext, ExprKind};

pub fn set_context(expr: Expr, ctx: ExprContext) -> Expr {
pub(crate) fn set_context(expr: Expr, ctx: ExprContext) -> Expr {
match expr.node {
ExprKind::Name { id, .. } => Expr {
node: ExprKind::Name { id, ctx },
Expand Down
66 changes: 60 additions & 6 deletions compiler/parser/src/error.rs
Original file line number Diff line number Diff line change
@@ -1,40 +1,71 @@
//! Define internal parse error types
//! The goal is to provide a matching and a safe error API, maksing errors from LALR
//! Error types for the parser.
//!
//! These types are used to represent errors that occur during lexing and parsing and are
//! returned by the `parse_*` functions in the [parser] module and the iterator in the
//! [lexer] implementation.
//!
//! [parser]: crate::parser
//! [lexer]: crate::lexer

// Define internal parse error types.
// The goal is to provide a matching and a safe error API, masking errors from LALR
use crate::{ast::Location, token::Tok};
use lalrpop_util::ParseError as LalrpopError;
use std::fmt;

/// Represents an error during lexing.
#[derive(Debug, PartialEq)]
pub struct LexicalError {
    /// The type of error that occurred.
    pub error: LexicalErrorType,
    /// The location in the source where the error occurred.
    pub location: Location,
}

impl LexicalError {
/// Creates a new `LexicalError` with the given error type and location.
pub fn new(error: LexicalErrorType, location: Location) -> Self {
Self { error, location }
}
}

/// Represents the different types of errors that can occur during lexing.
#[derive(Debug, PartialEq)]
pub enum LexicalErrorType {
    // TODO: Can probably be removed, the places it is used seem to be able
    // to use the `UnicodeError` variant instead.
    #[doc(hidden)]
    StringError,
    // TODO: Should take a start/end position to report.
    /// Decoding of a unicode escape sequence in a string literal failed.
    UnicodeError,
    /// The nesting of brackets/braces/parentheses is not balanced.
    NestingError,
    /// The indentation is not consistent.
    IndentationError,
    /// Inconsistent use of tabs and spaces.
    TabError,
    /// Encountered a tab after a space.
    TabsAfterSpaces,
    /// A non-default argument follows a default argument.
    DefaultArgumentError,
    /// A duplicate argument was found in a function definition.
    /// The payload is presumably the offending argument's name — verify against the lexer.
    DuplicateArgumentError(String),
    /// A positional argument follows a keyword argument.
    PositionalArgumentError,
    /// An iterable argument unpacking `*args` follows keyword argument unpacking `**kwargs`.
    UnpackedArgumentError,
    /// A keyword argument was repeated.
    DuplicateKeywordArgumentError(String),
    /// An unrecognized token was encountered.
    UnrecognizedToken { tok: char },
    /// An f-string error containing the [`FStringErrorType`].
    FStringError(FStringErrorType),
    /// An unexpected character was encountered after a line continuation.
    LineContinuationError,
    /// An unexpected end of file was encountered.
    Eof,
    /// An unexpected error occurred.
    OtherError(String),
}

Expand Down Expand Up @@ -85,13 +116,17 @@ impl fmt::Display for LexicalErrorType {
}

// TODO: consolidate these with ParseError
/// An error that occurred during parsing of an f-string.
#[derive(Debug, PartialEq)]
pub struct FStringError {
    /// The type of error that occurred.
    pub error: FStringErrorType,
    /// The location of the error in the source.
    pub location: Location,
}

impl FStringError {
/// Creates a new `FStringError` with the given error type and location.
pub fn new(error: FStringErrorType, location: Location) -> Self {
Self { error, location }
}
Expand All @@ -106,19 +141,33 @@ impl From<FStringError> for LexicalError {
}
}

/// Represents the different types of errors that can occur during parsing of an f-string.
#[derive(Debug, PartialEq)]
pub enum FStringErrorType {
    /// Expected a right brace after an opened left brace.
    UnclosedLbrace,
    /// Expected a left brace before an unmatched right brace.
    UnopenedRbrace,
    /// Expected a right brace after a conversion flag.
    ExpectedRbrace,
    /// An error occurred while parsing an f-string expression.
    InvalidExpression(Box<ParseErrorType>),
    /// An invalid conversion flag was encountered.
    InvalidConversionFlag,
    /// An empty expression was encountered.
    EmptyExpression,
    /// An opening delimiter was not closed properly; holds the opening and
    /// the mismatching closing character.
    MismatchedDelimiter(char, char),
    /// Too many nested expressions in an f-string.
    ExpressionNestedTooDeeply,
    /// The f-string expression cannot include the given character.
    ExpressionCannotInclude(char),
    /// A single right brace was encountered.
    SingleRbrace,
    /// A closing delimiter was not opened properly.
    Unmatched(char),
    // TODO: Test this case.
    /// Unterminated string.
    UnterminatedString,
}

Expand Down Expand Up @@ -167,9 +216,10 @@ impl From<FStringError> for LalrpopError<Location, Tok, LexicalError> {
}
}

/// Represents an error during parsing.
///
/// A [`ParseErrorType`] wrapped in the compiler core's common `BaseError` type.
pub type ParseError = rustpython_compiler_core::BaseError<ParseErrorType>;

/// Represents the different types of errors that can occur during parsing.
#[derive(Debug, PartialEq, thiserror::Error)]
pub enum ParseErrorType {
/// Parser encountered an unexpected end of input
Expand All @@ -180,11 +230,12 @@ pub enum ParseErrorType {
InvalidToken,
/// Parser encountered an unexpected token
UnrecognizedToken(Tok, Option<String>),
/// Maps to `User` type from `lalrpop-util`
// Maps to `User` type from `lalrpop-util`
/// Parser encountered an error during lexing.
Lexical(LexicalErrorType),
}

/// Convert `lalrpop_util::ParseError` to our internal type
// Convert `lalrpop_util::ParseError` to our internal type
pub(crate) fn parse_error_from_lalrpop(
err: LalrpopError<Location, Tok, LexicalError>,
source_path: &str,
Expand Down Expand Up @@ -258,6 +309,7 @@ impl fmt::Display for ParseErrorType {
}

impl ParseErrorType {
/// Returns true if the error is an indentation error.
pub fn is_indentation_error(&self) -> bool {
match self {
ParseErrorType::Lexical(LexicalErrorType::IndentationError) => true,
Expand All @@ -267,6 +319,8 @@ impl ParseErrorType {
_ => false,
}
}

/// Returns true if the error is a tab error.
pub fn is_tab_error(&self) -> bool {
matches!(
self,
Expand Down
118 changes: 109 additions & 9 deletions compiler/parser/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,19 +1,119 @@
//! This crate can be used to parse python sourcecode into a so
//! called AST (abstract syntax tree).
//! This crate can be used to parse Python source code into an Abstract
//! Syntax Tree.
//!
//! The stages involved in this process are lexical analysis and
//! parsing. The lexical analysis splits the sourcecode into
//! tokens, and the parsing transforms those tokens into an AST.
//! ## Overview:
//!
//! For example, one could do this:
//! The process by which source code is parsed into an AST can be broken down
//! into two general stages: [lexical analysis] and [parsing].
//!
//! During lexical analysis, the source code is converted into a stream of lexical
//! tokens that represent the smallest meaningful units of the language. For example,
//! the source code `print("Hello world")` would _roughly_ be converted into the following
//! stream of tokens:
//!
//! ```text
//! Name("print"), LeftParen, String("Hello world"), RightParen
//! ```
//! use rustpython_parser::{parser, ast};
//!
//! let python_source = "print('Hello world')";
//! let python_ast = parser::parse_expression(python_source, "<embedded>").unwrap();
//! These tokens are then consumed by the parser, which matches them against a set of
//! grammar rules to verify that the source code is syntactically valid and to construct
//! an AST that represents the source code.
//!
//! During parsing, the parser consumes the tokens generated by the lexer and constructs
//! a tree representation of the source code. The tree is made up of nodes that represent
//! the different syntactic constructs of the language. If the source code is syntactically
//! invalid, parsing fails and an error is returned. After a successful parse, the AST can
//! be used to perform further analysis on the source code. Continuing with the example
//! above, the AST generated by the parser would _roughly_ look something like this:
//!
//! ```text
//! node: Expr {
//! value: {
//! node: Call {
//! func: {
//! node: Name {
//! id: "print",
//! ctx: Load,
//! },
//! },
//! args: [
//! node: Constant {
//! value: Str("Hello World"),
//! kind: None,
//! },
//! ],
//! keywords: [],
//! },
//! },
//! },
//!```
//!
//! Note: The Tokens/ASTs shown above are not the exact tokens/ASTs generated by the parser.
//!
//! ## Source code layout:
//!
//! The functionality of this crate is split into several modules:
//!
//! - [token]: This module contains the definition of the tokens that are generated by the lexer.
//! - [lexer]: This module contains the lexer and is responsible for generating the tokens.
//! - [parser]: This module contains an interface to the parser and is responsible for generating the AST.
//! - Functions and strings have special parsing requirements that are handled in additional files.
//! - [mode]: This module contains the definition of the different modes that the parser can be in.
//! - [error]: This module contains the definition of the errors that can be returned by the parser.
//!
//! # Examples
//!
//! For example, to get a stream of tokens from a given string, one could do this:
//!
//! ```
//! use rustpython_parser::lexer::make_tokenizer;
//!
//! let python_source = r#"
//! def is_odd(i):
//! return bool(i & 1)
//! "#;
//! let mut tokens = make_tokenizer(python_source);
//! assert!(tokens.all(|t| t.is_ok()));
//! ```
//!
//! These tokens can be directly fed into the parser to generate an AST:
//!
//! ```
//! use rustpython_parser::parser::{parse_tokens, Mode};
//! use rustpython_parser::lexer::make_tokenizer;
//!
//! let python_source = r#"
//! def is_odd(i):
//! return bool(i & 1)
//! "#;
//! let tokens = make_tokenizer(python_source);
//! let ast = parse_tokens(tokens, Mode::Module, "<embedded>");
//!
//! assert!(ast.is_ok());
//! ```
//!
//! Alternatively, you can use one of the other `parse_*` functions to parse a string directly without using a specific
//! mode or tokenizing the source beforehand:
//!
//! ```
//! use rustpython_parser::parser::parse_program;
//!
//! let python_source = r#"
//! def is_odd(i):
//! return bool(i & 1)
//! "#;
//! let ast = parse_program(python_source, "<embedded>");
//!
//! assert!(ast.is_ok());
//! ```
//!
//! [lexical analysis]: https://en.wikipedia.org/wiki/Lexical_analysis
//! [parsing]: https://en.wikipedia.org/wiki/Parsing
//! [token]: crate::token
//! [lexer]: crate::lexer
//! [parser]: crate::parser
//! [mode]: crate::mode
//! [error]: crate::error

#![doc(html_logo_url = "https://raw.githubusercontent.com/RustPython/RustPython/main/logo.png")]
#![doc(html_root_url = "https://docs.rs/rustpython-parser/")]
Expand Down
6 changes: 6 additions & 0 deletions compiler/parser/src/mode.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
//! Control over the different modes by which a source file can be parsed.
use crate::token::Tok;

/// The mode argument specifies in what way code must be parsed.
#[derive(Clone, Copy)]
pub enum Mode {
    /// The code consists of a sequence of statements.
    Module,
    /// The code consists of a sequence of interactive statements.
    Interactive,
    /// The code consists of a single expression.
    Expression,
}

Expand Down Expand Up @@ -39,6 +44,7 @@ impl std::str::FromStr for Mode {
}
}

/// Returned when a given mode is not valid.
#[derive(Debug)]
pub struct ModeParseError {
_priv: (),
Expand Down
Loading