%!TEX root = std.tex
\rSec0[lex]{Lexical conventions}

\gramSec[gram.lex]{Lexical conventions}

\indextext{lexical conventions|see{conventions, lexical}}
\indextext{translation!separate|see{compilation, separate}}
\indextext{separate translation|see{compilation, separate}}
\indextext{separate compilation|see{compilation, separate}}
\indextext{phases of translation|see{translation, phases}}
\indextext{source file character|see{character, source file}}
\indextext{alternative token|see{token, alternative}}
\indextext{digraph|see{token, alternative}}
\indextext{integer literal|see{literal, integer}}
\indextext{character literal|see{literal, character}}
\indextext{floating-point literal|see{literal, floating-point}}
\indextext{string literal|see{literal, string}}
\indextext{boolean literal|see{literal, boolean}}
\indextext{pointer literal|see{literal, pointer}}
\indextext{user-defined literal|see{literal, user-defined}}
\indextext{file, source|see{source file}}
\indextext{null character|see{character, null}}
\indextext{null wide character|see{wide-character, null}}

\rSec1[lex.separate]{Separate translation}

\pnum
\indextext{conventions!lexical|(}%
\indextext{compilation!separate|(}%
The text of the program is kept in units called
\defnx{source files}{source file} in this document.
A source file together with all the headers\iref{headers}
and source files included\iref{cpp.include} via the preprocessing
directive \tcode{\#include}, less any source lines skipped by any of the
conditional inclusion\iref{cpp.cond} preprocessing directives,
as modified by the implementation-defined behavior of any
conditionally-supported-directives\iref{cpp.pre} and pragmas\iref{cpp.pragma},
if any, is
called a \defnadj{preprocessing}{translation unit}.
\begin{note}
A \Cpp{} program need not all be translated at the same time.
Translation units can be separately translated and then later linked
to produce an executable program\iref{basic.link}.
\end{note}
\indextext{compilation!separate|)}

\rSec1[lex.phases]{Phases of translation}%

\pnum
\indextext{translation!phases|(}%
The precedence among the syntax rules of translation is specified by the
following phases.
\begin{footnote}
Implementations behave as if these separate phases
occur, although in practice different phases can be folded together.
\end{footnote}

\begin{enumerate}
\item
\indextext{character!source file}%
An implementation shall support input files
that are a sequence of UTF-8 code units (UTF-8 files).
It may also support
an \impldef{supported input files} set of other kinds of input files, and,
if so, the kind of an input file is determined in
an \impldef{determination of kind of input file} manner
that includes a means of designating input files as UTF-8 files,
independent of their content.
\begin{note}
In other words,
recognizing the \unicode{feff}{byte order mark} is not sufficient.
\end{note}
If an input file is determined to be a UTF-8 file,
then it shall be a well-formed UTF-8 code unit sequence and
it is decoded to produce a sequence of Unicode
\begin{footnote}
Unicode\textregistered\ is a registered trademark of Unicode, Inc.
This information is given for the convenience of users of this document and
does not constitute an endorsement by ISO or IEC of this product.
\end{footnote}
scalar values.
A sequence of translation character set elements\iref{lex.charset} is then formed
by mapping each Unicode scalar value
to the corresponding translation character set element.
In the resulting sequence,
each pair of characters in the input sequence consisting of
\unicode{000d}{carriage return} followed by \unicode{000a}{line feed},
as well as each
\unicode{000d}{carriage return} not immediately followed by a \unicode{000a}{line feed},
is replaced by a single new-line character.

For any other kind of input file supported by the implementation,
characters are mapped, in an
\impldef{mapping input file characters to translation character set} manner,
to a sequence of translation character set elements,
representing end-of-line indicators as new-line characters.

\item
\indextext{line splicing}%
If the first translation character is \unicode{feff}{byte order mark},
it is deleted.
Each sequence of a backslash character (\textbackslash)
immediately followed by
zero or more whitespace characters other than new-line followed by
a new-line character is deleted, splicing
physical source lines to form \defnx{logical source lines}{source line!logical}. Only the last
backslash on any physical source line shall be eligible for being part
of such a splice.
\begin{note}
Line splicing can form
a \grammarterm{universal-character-name}\iref{lex.charset}.
\end{note}
A source file that is not empty and that (after splicing)
does not end in a new-line character
shall be processed as if an additional new-line character were appended
to the file.

\item The source file is decomposed into preprocessing
tokens\iref{lex.pptoken} and sequences of whitespace characters
(including comments). A source file shall not end in a partial
preprocessing token or in a partial comment.
\begin{footnote}
A partial preprocessing
token would arise from a source file
ending in the first portion of a multi-character token that requires a
terminating sequence of characters, such as a \grammarterm{header-name}
that is missing the closing \tcode{"}
or \tcode{>}. A partial comment
would arise from a source file ending with an unclosed \tcode{/*}
comment.
\end{footnote}
Each comment\iref{lex.comment} is replaced by one space character. New-line characters are
retained. Whether each nonempty sequence of whitespace characters other
than new-line is retained or replaced by one space character is
unspecified.
As characters from the source file are consumed
to form the next preprocessing token
(i.e., not being consumed as part of a comment or other forms of whitespace),
except when matching a
\grammarterm{c-char-sequence},
\grammarterm{s-char-sequence},
\grammarterm{r-char-sequence},
\grammarterm{h-char-sequence}, or
\grammarterm{q-char-sequence},
\grammarterm{universal-character-name}s are recognized\iref{lex.universal.char} and
replaced by the designated element of the translation character set\iref{lex.charset}.
The process of dividing a source file's
characters into preprocessing tokens is context-dependent.
\begin{example}
See the handling of \tcode{<} within a \tcode{\#include} preprocessing
directive\iref{lex.header,cpp.include}.
\end{example}

\item The source file is analyzed as a \grammarterm{preprocessing-file}\iref{cpp.pre}.
Preprocessing directives\iref{cpp} are executed, macro invocations are
expanded\iref{cpp.replace}, and \tcode{_Pragma} unary operator expressions are executed\iref{cpp.pragma.op}.
A \tcode{\#include} preprocessing directive\iref{cpp.include} causes the named header or
source file to be processed from phase 1 through phase 4, recursively.
All preprocessing directives are then deleted.

\item
For a sequence of two or more adjacent \grammarterm{string-literal} preprocessing tokens,
a common \grammarterm{encoding-prefix} is determined
as specified in \ref{lex.string}.
Each such \grammarterm{string-literal} preprocessing token is then considered to have
that common \grammarterm{encoding-prefix}.

\item
\indextext{concatenation!string}%
Adjacent \grammarterm{string-literal} preprocessing tokens are concatenated\iref{lex.string}.

\item
Each preprocessing token is converted into a token\iref{lex.token}.
Whitespace characters separating tokens are no longer significant.
The resulting tokens constitute a \defn{translation unit} and
are syntactically and
semantically analyzed as a \grammarterm{translation-unit}\iref{basic.link} and
translated.
\begin{note}
The process of analyzing and translating the tokens can occasionally
result in one token being replaced by a sequence of other
tokens\iref{temp.names}.
\end{note}
It is
\impldef{whether the sources for
module units and header units
on which the current translation unit has an interface
dependency are required to be available during translation}
whether the sources for
module units and header units
on which the current translation unit has an interface
dependency\iref{module.unit,module.import}
are required to be available.
\begin{note}
Source files, translation
units and translated translation units need not necessarily be stored as
files, nor need there be any one-to-one correspondence between these
entities and any external representation. The description is conceptual
only, and does not specify any particular implementation.
\end{note}
\begin{note}
Previously translated translation units can be preserved individually or in libraries.
The separate translation units of a program communicate\iref{basic.link} by (for example)
calls to functions whose identifiers have external or module linkage,
manipulation of objects whose identifiers have external or module linkage, or
manipulation of data files.
\end{note}

While the tokens constituting translation units
are being analyzed and translated,
required instantiations are performed.
\begin{note}
This can include
instantiations which have been explicitly
requested\iref{temp.explicit}.
\end{note}

The contexts from which instantiations may be performed
are determined by their respective points of instantiation\iref{temp.point}.

\begin{note}
Other requirements in this document can further constrain
the context from which an instantiation can be performed.
For example, a constexpr function template specialization
might have a point of instantiation at the end of a translation unit,
but its use in certain constant expressions could require
that it be instantiated at an earlier point\iref{temp.inst}.
\end{note}

Each instantiation results in new program constructs.
The program is ill-formed if any instantiation fails.

During the analysis and translation of tokens,
certain expressions are evaluated\iref{expr.const}.
Constructs appearing at a program point $P$ are analyzed
in a context where each side effect of evaluating an expression $E$
as a full-expression is complete if and only if
\begin{itemize}
\item
$E$ is the expression corresponding to
a \grammarterm{consteval-block-declaration}\iref{dcl.pre}, and
\item
either that \grammarterm{consteval-block-declaration} or
the template definition from which it is instantiated
is reachable from\iref{module.reach}
\begin{itemize}
\item
$P$, or
\item
the point immediately following
the \grammarterm{class-specifier} of the outermost class
for which $P$ is in a complete-class context\iref{class.mem.general}.
\end{itemize}
\end{itemize}
\begin{example}
\begin{codeblock}
class S {
  class Incomplete;

  class Inner {
    void fn() {
      /* @$p_1$@ */ Incomplete i;    // OK
    }
  }; /* @$p_2$@ */

  consteval {
    define_aggregate(^^Incomplete, {});
  }
}; /* @$p_3$@ */
\end{codeblock}
Constructs at $p_1$ are analyzed in a context
where the side effect of the call to \tcode{define_aggregate} is evaluated
because
\begin{itemize}
\item
$E$ is the expression corresponding to a consteval block, and
\item
$p_1$ is in a complete-class context of \tcode{S} and
the consteval block is reachable from $p_3$.
\end{itemize}
\end{example}

\item
\indextext{linking}%
Translated translation units are combined, and
all external entity references are resolved. Library
components are linked to satisfy external references to
entities not defined in the current translation. All such translator
output is collected into a program image which contains information
needed for execution in its execution environment.%
\indextext{translation!phases|)}
\end{enumerate}

\rSec1[lex.char]{Characters}%

\rSec2[lex.charset]{Character sets}

\pnum
\indextext{character set|(}%
The \defnadj{translation}{character set} consists of the following elements:
\begin{itemize}
\item
each abstract character assigned a code point in the Unicode codespace
as specified in the Unicode Standard, and
\item
a distinct character for each Unicode scalar value
not assigned to an abstract character.
\end{itemize}
\begin{note}
Unicode code points are integers
in the range $[0, \mathrm{10FFFF}]$ (hexadecimal).
A surrogate code point is a value
in the range $[\mathrm{D800}, \mathrm{DFFF}]$ (hexadecimal).
A Unicode scalar value is any code point that is not a surrogate code point.
\end{note}

\pnum
The \defnadj{basic}{character set} is a subset of the translation character set,
consisting of 99 characters as specified in \tref{lex.charset.basic}.
\begin{note}
Unicode short names are given only as a means of identifying the character;
the numerical value has no other meaning in this context.
\end{note}

\begin{floattable}{Basic character set}{lex.charset.basic}{lll}
\topline
\lhdrx{2}{character} & \rhdr{glyph} \\ \capsep
\ucode{0009} & \uname{character tabulation} & \\
\ucode{000b} & \uname{line tabulation} & \\
\ucode{000c} & \uname{form feed} & \\
\ucode{0020} & \uname{space} & \\
\ucode{000a} & \uname{line feed} & new-line \\
\ucode{0021} & \uname{exclamation mark} & \tcode{!} \\
\ucode{0022} & \uname{quotation mark} & \tcode{"} \\
\ucode{0023} & \uname{number sign} & \tcode{\#} \\
\ucode{0024} & \uname{dollar sign} & \tcode{\$} \\
\ucode{0025} & \uname{percent sign} & \tcode{\%} \\
\ucode{0026} & \uname{ampersand}  & \tcode{\&} \\
\ucode{0027} & \uname{apostrophe} & \tcode{'} \\
\ucode{0028} & \uname{left parenthesis} & \tcode{(} \\
\ucode{0029} & \uname{right parenthesis} & \tcode{)} \\
\ucode{002a} & \uname{asterisk} & \tcode{*} \\
\ucode{002b} & \uname{plus sign} & \tcode{+} \\
\ucode{002c} & \uname{comma} & \tcode{,} \\
\ucode{002d} & \uname{hyphen-minus} & \tcode{-} \\
\ucode{002e} & \uname{full stop} & \tcode{.} \\
\ucode{002f} & \uname{solidus} & \tcode{/} \\
\ucode{0030} .. \ucode{0039} & \uname{digit zero .. nine} & \tcode{0 1 2 3 4 5 6 7 8 9} \\
\ucode{003a} & \uname{colon} & \tcode{:} \\
\ucode{003b} & \uname{semicolon} & \tcode{;} \\
\ucode{003c} & \uname{less-than sign} & \tcode{<} \\
\ucode{003d} & \uname{equals sign} & \tcode{=} \\
\ucode{003e} & \uname{greater-than sign} & \tcode{>} \\
\ucode{003f} & \uname{question mark} & \tcode{?} \\
\ucode{0040} & \uname{commercial at} & \tcode{@} \\
\ucode{0041} .. \ucode{005a} & \uname{latin capital letter a .. z} & \tcode{A B C D E F G H I J K L M} \\
 & & \tcode{N O P Q R S T U V W X Y Z} \\
\ucode{005b} & \uname{left square bracket} & \tcode{[} \\
\ucode{005c} & \uname{reverse solidus} & \tcode{\textbackslash} \\
\ucode{005d} & \uname{right square bracket} & \tcode{]} \\
\ucode{005e} & \uname{circumflex accent} & \tcode{\caret} \\
\ucode{005f} & \uname{low line} & \tcode{_} \\
\ucode{0060} & \uname{grave accent} & \tcode{\`} \\
\ucode{0061} .. \ucode{007a} & \uname{latin small letter a .. z} & \tcode{a b c d e f g h i j k l m} \\
 & & \tcode{n o p q r s t u v w x y z} \\
\ucode{007b} & \uname{left curly bracket} & \tcode{\{} \\
\ucode{007c} & \uname{vertical line} & \tcode{|} \\
\ucode{007d} & \uname{right curly bracket} & \tcode{\}} \\
\ucode{007e} & \uname{tilde} & \tcode{\textasciitilde} \\
\end{floattable}

\pnum
The \defnadj{basic literal}{character set} consists of
all characters of the basic character set,
plus the control characters specified in \tref{lex.charset.literal}.

\begin{floattable}{Additional control characters in the basic literal character set}{lex.charset.literal}{ll}
\topline
\ohdrx{2}{character} \\ \capsep
\ucode{0000} & \uname{null} \\
\ucode{0007} & \uname{alert} \\
\ucode{0008} & \uname{backspace} \\
\ucode{000d} & \uname{carriage return} \\
\end{floattable}

\pnum
A \defn{code unit} is an integer value
of character type\iref{basic.fundamental}.
Characters in a \grammarterm{character-literal}
other than a multicharacter or non-encodable character literal or
in a \grammarterm{string-literal} are encoded as
a sequence of one or more code units, as determined
by the \grammarterm{encoding-prefix}\iref{lex.ccon,lex.string};
this is termed the respective \defnadj{literal}{encoding}.
The \defnadj{ordinary literal}{encoding} is
the encoding applied to an ordinary character or string literal.
The \defnadj{wide literal}{encoding} is the encoding applied
to a wide character or string literal.

\pnum
A literal encoding or a locale-specific encoding of one of
the execution character sets\iref{character.seq}
encodes each element of the basic literal character set as
a single code unit with non-negative value,
distinct from the code unit for any other such element.
\begin{note}
A character not in the basic literal character set
can be encoded with more than one code unit;
the value of such a code unit can be the same as
that of a code unit for an element of the basic literal character set.
\end{note}
\indextext{character!null}%
\indextext{wide-character!null}%
The \unicode{0000}{null} character is encoded as the value \tcode{0}.
No other element of the translation character set
is encoded with a code unit of value \tcode{0}.
The code unit value of each decimal digit character after the digit \tcode{0} (\ucode{0030})
shall be one greater than the value of the previous.
The ordinary and wide literal encodings are otherwise
\impldef{ordinary and wide literal encodings}.
\indextext{UTF-8}%
\indextext{UTF-16}%
\indextext{UTF-32}%
For a UTF-8, UTF-16, or UTF-32 literal,
the implementation shall encode
the Unicode scalar value
corresponding to each character of the translation character set
as specified in the Unicode Standard
for the respective Unicode encoding form.
\indextext{character set|)}

\rSec2[lex.universal.char]{Universal character names}

\begin{bnf}
\nontermdef{n-char}\br
     \textnormal{any member of the translation character set except the \unicode{007d}{right curly bracket} or new-line character}
\end{bnf}

\begin{bnf}
\nontermdef{n-char-sequence}\br
    n-char \opt{n-char-sequence}
\end{bnf}

\begin{bnf}
\nontermdef{named-universal-character}\br
    \terminal{\textbackslash N\{} n-char-sequence \terminal{\}}
\end{bnf}

\begin{bnf}
\nontermdef{hex-quad}\br
    hexadecimal-digit hexadecimal-digit hexadecimal-digit hexadecimal-digit
\end{bnf}

\begin{bnf}
\nontermdef{simple-hexadecimal-digit-sequence}\br
    hexadecimal-digit \opt{simple-hexadecimal-digit-sequence}
\end{bnf}

\begin{bnf}
\nontermdef{universal-character-name}\br
    \terminal{\textbackslash u} hex-quad\br
    \terminal{\textbackslash U} hex-quad hex-quad\br
    \terminal{\textbackslash u\{} simple-hexadecimal-digit-sequence \terminal{\}}\br
    named-universal-character
\end{bnf}

\pnum
The \grammarterm{universal-character-name} construct provides a way to name any
element in the translation character set using just the basic character set.
If a \grammarterm{universal-character-name} outside
the \grammarterm{c-char-sequence}, \grammarterm{s-char-sequence}, or
\grammarterm{r-char-sequence} of a \grammarterm{character-literal} or
\grammarterm{string-literal}
(in either case, including within a \grammarterm{user-defined-literal})
corresponds to a control character or to a character in the basic character set,
the program is ill-formed.
\begin{note}
A sequence of characters resembling a \grammarterm{universal-character-name} in an
\grammarterm{r-char-sequence}\iref{lex.string} does not form a
\grammarterm{universal-character-name}.
\end{note}

\pnum
A \grammarterm{universal-character-name}
of the form \tcode{\textbackslash u} \grammarterm{hex-quad},
\tcode{\textbackslash U} \grammarterm{hex-quad} \grammarterm{hex-quad}, or
\tcode{\textbackslash u\{\grammarterm{simple-hexadecimal-digit-sequence}\}}
designates the character in the translation character set
whose Unicode scalar value is the hexadecimal number represented by
the sequence of \grammarterm{hexadecimal-digit}s
in the \grammarterm{universal-character-name}.
The program is ill-formed if that number is not a Unicode scalar value.

\pnum
A \grammarterm{universal-character-name}
that is a \grammarterm{named-universal-character}
designates the corresponding character
in the Unicode Standard (chapter 4.8 Name)
if the \grammarterm{n-char-sequence} is equal
to its character name or
to one of its character name aliases of
type ``control'', ``correction'', or ``alternate'';
otherwise, the program is ill-formed.
\begin{note}
These aliases are listed in
the Unicode Character Database's \tcode{NameAliases.txt}.
None of these names or aliases have leading or trailing spaces.
\end{note}

\rSec1[lex.comment]{Comments}

\pnum
\indextext{comment|(}%
\indextext{comment!\tcode{/*} \tcode{*/}}%
\indextext{comment!\tcode{//}}%
The characters \tcode{/*} start a comment, which terminates with the
characters \tcode{*/}. These comments do not nest.
\indextext{comment!\tcode{//}}%
The characters \tcode{//} start a comment, which terminates immediately before the
next new-line character.
\begin{note}
The comment characters \tcode{//}, \tcode{/*},
and \tcode{*/} have no special meaning within a \tcode{//} comment and
are treated just like other characters. Similarly, the comment
characters \tcode{//} and \tcode{/*} have no special meaning within a
\tcode{/*} comment.
\end{note}
\indextext{comment|)}

\rSec1[lex.pptoken]{Preprocessing tokens}

\indextext{token!preprocessing|(}%
\begin{bnf}
\nontermdef{preprocessing-token}\br
    header-name\br
    import-keyword\br
    module-keyword\br
    export-keyword\br
    identifier\br
    pp-number\br
    character-literal\br
    user-defined-character-literal\br
    string-literal\br
    user-defined-string-literal\br
    preprocessing-op-or-punc\br
    \textnormal{each non-whitespace character that cannot be one of the above}
\end{bnf}

\pnum
A preprocessing token is the minimal lexical element of the language in translation
phases 3 through 6.
In this document,
glyphs are used to identify
elements of the basic character set\iref{lex.charset}.
The categories of preprocessing token are: header names,
placeholder tokens produced by preprocessing \tcode{import} and \tcode{module} directives
(\grammarterm{import-keyword}, \grammarterm{module-keyword}, and \grammarterm{export-keyword}),
identifiers, preprocessing numbers, character literals (including user-defined character
literals), string literals (including user-defined string literals), preprocessing
operators and punctuators, and single non-whitespace characters that do not lexically
match the other preprocessing token categories.
If a \unicode{0027}{apostrophe} or a \unicode{0022}{quotation mark} character
matches the last category, the program is ill-formed.
If any character not in the basic character set matches the last category,
the program is ill-formed.
Preprocessing tokens can be separated by
\indextext{whitespace}%
whitespace;
\indextext{comment}%
this consists of comments\iref{lex.comment}, or whitespace characters
(\unicode{0020}{space},
\unicode{0009}{character tabulation},
new-line,
\unicode{000b}{line tabulation}, and
\unicode{000c}{form feed}), or both.
As described in \ref{cpp}, in certain
circumstances during translation phase 4, whitespace (or the absence
thereof) serves as more than preprocessing token separation. Whitespace
can appear within a preprocessing token only as part of a header name or
between the quotation characters in a character literal or
string literal.

\pnum
Each preprocessing token that is converted to a token\iref{lex.token}
shall have the lexical form of a keyword, an identifier, a literal,
or an operator or punctuator.

\pnum
The \grammarterm{import-keyword} is produced
by preprocessing an \keyword{import} directive\iref{cpp.import},
the \grammarterm{module-keyword} is produced
by preprocessing a \keyword{module} directive\iref{cpp.module}, and
the \grammarterm{export-keyword} is produced
by preprocessing either of the previous two directives.
\begin{note}
None has any observable spelling.
\end{note}

\pnum
If the input stream has been parsed into preprocessing tokens up to a
given character:
\begin{itemize}
\item
\indextext{literal!string!raw}%
If the next character begins a sequence of characters that could be the prefix
and initial double quote of a raw string literal, such as \tcode{R"}, the next preprocessing
token shall be a raw string literal. Between the initial and final
double quote characters of the raw string, any transformations performed in phase
2 (line splicing) are reverted; this reversion
shall apply before any \grammarterm{d-char}, \grammarterm{r-char}, or delimiting
parenthesis is identified. The raw string literal is defined as the shortest sequence
of characters that matches the raw-string pattern
\begin{ncbnf}
\opt{encoding-prefix} \terminal{R} raw-string
\end{ncbnf}

\item Otherwise, if the next three characters are \tcode{<::} and the subsequent character
is neither \tcode{:} nor \tcode{>}, the \tcode{<} is treated as a preprocessing token by
itself and not as the first character of the alternative token \tcode{<:}.

\item
Otherwise, if the next three characters are \tcode{[::} and
the subsequent character is not \tcode{:}, or
if the next three characters are \tcode{[:>},
the \tcode{[} is treated as a preprocessing token by itself and
not as the first character of the preprocessing token \tcode{[:}.
\begin{note}
The tokens \tcode{[:} and \tcode{:]} cannot be composed from digraphs.
\end{note}

\item Otherwise,
the next preprocessing token is the longest sequence of
characters that could constitute a preprocessing token, even if that
would cause further lexical analysis to fail,
except that
\begin{itemize}
\item
a \grammarterm{string-literal} token is never formed
when a \grammarterm{header-name} token can be formed, and
\item
a \grammarterm{header-name}\iref{lex.header} is only formed
\begin{itemize}
\item
immediately after the \tcode{include}, \tcode{embed}, or \tcode{import} preprocessing token in a
\tcode{\#include}\iref{cpp.include}, \tcode{\#embed}\iref{cpp.embed}, or
\tcode{import}\iref{cpp.import} directive, respectively, or
\item
immediately after a preprocessing token sequence of \xname{has_include}
or \xname{has_embed} immediately followed by \tcode{(}
in a \tcode{\#if}, \tcode{\#elif}, or \tcode{\#embed} directive\iref{cpp.cond,cpp.embed}.
\end{itemize}
\end{itemize}
\end{itemize}

\pnum
\begin{example}
\begin{codeblock}
#define R "x"
const char* s = R"y";           // ill-formed raw string, not \tcode{"x" "y"}
\end{codeblock}
\end{example}

\pnum
\begin{example}
The program fragment \tcode{0xe+foo} is parsed as a
preprocessing number token (one that is not a valid
\grammarterm{integer-literal} or \grammarterm{floating-point-literal} token),
even though a parse as three preprocessing tokens
\tcode{0xe}, \tcode{+}, and \tcode{foo} can produce a valid expression (for example,
if \tcode{foo} is a macro defined as \tcode{1}). Similarly, the
program fragment \tcode{1E1} is parsed as a preprocessing number (one
that is a valid \grammarterm{floating-point-literal} token),
whether or not \tcode{E} is a macro name.
\end{example}

\pnum
\begin{example}
The program fragment \tcode{x+++++y} is parsed as \tcode{x
++ ++ + y}, which, if \tcode{x} and \tcode{y} have integral types,
violates a constraint on increment operators, even though the parse
\tcode{x ++ + ++ y} can yield a correct expression.
\end{example}
\indextext{token!preprocessing|)}

\rSec1[lex.header]{Header names}

\indextext{header!name|(}%
\begin{bnf}
\microtypesetup{protrusion=false}
\nontermdef{header-name}\br
    \terminal{<} h-char-sequence \terminal{>}\br
    \terminal{"} q-char-sequence \terminal{"}
\end{bnf}

\begin{bnf}
\nontermdef{h-char-sequence}\br
    h-char \opt{h-char-sequence}
\end{bnf}

\begin{bnf}
\nontermdef{h-char}\br
    \textnormal{any member of the translation character set except new-line and \unicode{003e}{greater-than sign}}
\end{bnf}

\begin{bnf}
\nontermdef{q-char-sequence}\br
    q-char \opt{q-char-sequence}
\end{bnf}

\begin{bnf}
\nontermdef{q-char}\br
    \textnormal{any member of the translation character set except new-line and \unicode{0022}{quotation mark}}
\end{bnf}

\pnum
The sequences in both forms of \grammarterm{header-name}{s} are mapped in an
\impldef{mapping header name to header or external source file} manner to headers or to
external source file names as specified in~\ref{cpp.include}.
\begin{note}
Header name preprocessing tokens appear only within
a \tcode{\#include} or \tcode{\#embed} preprocessing directive,
a \xname{has_include} or \xname{has_embed} preprocessing expression, or
after certain occurrences of an \tcode{import} token
(see~\ref{lex.pptoken}).
\end{note}

\pnum
The appearance of either of the characters \tcode{'} or \tcode{\textbackslash} or of
either of the character sequences \tcode{/*} or \tcode{//} in a
\grammarterm{q-char-sequence} or an \grammarterm{h-char-sequence}
is conditionally-supported with \impldef{meaning of \tcode{'}, \tcode{\textbackslash},
\tcode{/*}, or \tcode{//} in a \grammarterm{q-char-sequence} or an
\grammarterm{h-char-sequence}} semantics, as is the appearance of the character
\tcode{"} in an \grammarterm{h-char-sequence}.
\begin{note}
Thus, a sequence of characters
that resembles an escape sequence can result in an error, be interpreted as the
character corresponding to the escape sequence, or have a completely different meaning,
depending on the implementation.
\end{note}
\indextext{header!name|)}

\rSec1[lex.ppnumber]{Preprocessing numbers}

\indextext{number!preprocessing|(}%
\begin{bnf}
\nontermdef{pp-number}\br
    digit\br
    \terminal{.} digit\br
    pp-number identifier-continue\br
    pp-number \terminal{'} digit\br
    pp-number \terminal{'} nondigit\br
    pp-number \terminal{e} sign\br
    pp-number \terminal{E} sign\br
    pp-number \terminal{p} sign\br
    pp-number \terminal{P} sign\br
    pp-number \terminal{.}
\end{bnf}

\pnum
Preprocessing number tokens lexically include
all \grammarterm{integer-literal} tokens\iref{lex.icon} and
all \grammarterm{floating-point-literal} tokens\iref{lex.fcon}.

\pnum
A preprocessing number does not have a type or a value; it acquires both
after a successful conversion to
an \grammarterm{integer-literal} token or
a \grammarterm{floating-point-literal} token.%
\indextext{number!preprocessing|)}

\rSec1[lex.operators]{Operators and punctuators}

\pnum
\indextext{operator|(}%
\indextext{punctuator|(}%
The lexical representation of \Cpp{} programs includes a number of
preprocessing tokens that are used in the syntax of the preprocessor or
are converted into tokens for operators and punctuators:

\begin{bnf}
\nontermdef{preprocessing-op-or-punc}\br
    preprocessing-operator\br
    operator-or-punctuator
\end{bnf}

\begin{bnf}
%% Ed. note: character protrusion would misalign various operators.
\microtypesetup{protrusion=false}
\nontermdef{preprocessing-operator} \textnormal{one of}\br
    \terminal{\# \ \ \ \ \ \ \ \#\# \ \ \ \ \ \ \%: \ \ \ \ \ \ \%:\%:}
\end{bnf}

\begin{bnf}
\microtypesetup{protrusion=false}
\nontermdef{operator-or-punctuator} \textnormal{one of}\br
    \terminal{\{ \ \ \ \ \ \ \ \} \ \ \ \ \ \ \ [ \ \ \ \ \ \ \ ] \ \ \ \ \ \ \ ( \ \ \ \ \ \ \ ) \ \ \ \ \ \ \ [: \ \ \ \ \ \ :]}\br
    \terminal{<\% \ \ \ \ \ \ \%> \ \ \ \ \ \ <: \ \ \ \ \ \ :> \ \ \ \ \ \ ; \ \ \ \ \ \ \ : \ \ \ \ \ \ \ ...}\br
    \terminal{? \ \ \ \ \ \ \ :: \ \ \ \ \ \ . \ \ \ \ \ \ \ .* \ \ \ \ \ \ -> \ \ \ \ \ \ ->* \ \ \ \ \ \caret{}\caret{} \ \ \ \ \ \ \~}\br
    \terminal{! \ \ \ \ \ \ \ + \ \ \ \ \ \ \ - \ \ \ \ \ \ \ * \ \ \ \ \ \ \ / \ \ \ \ \ \ \ \% \ \ \ \ \ \ \ \caret{} \ \ \ \ \ \ \ \& \ \ \ \ \ \ \ |}\br
    \terminal{= \ \ \ \ \ \ \ += \ \ \ \ \ \ -= \ \ \ \ \ \ *= \ \ \ \ \ \ /= \ \ \ \ \ \ \%= \ \ \ \ \ \ \caret{}= \ \ \ \ \ \ \&= \ \ \ \ \ \ |=}\br
    \terminal{== \ \ \ \ \ \ != \ \ \ \ \ \ < \ \ \ \ \ \ \ > \ \ \ \ \ \ \ <= \ \ \ \ \ \ >= \ \ \ \ \ \ <=> \ \ \ \ \ \&\& \ \ \ \ \ \ ||}\br
    \terminal{<< \ \ \ \ \ \ >> \ \ \ \ \ \ <<= \ \ \ \ \ >>= \ \ \ \ \ ++ \ \ \ \ \ \ -- \ \ \ \ \ \ ,}\br
    \terminal{\keyword{and} \ \ \ \ \ \keyword{or} \ \ \ \ \ \ \keyword{xor} \ \ \ \ \ \keyword{not} \ \ \ \ \ \keyword{bitand} \ \ \keyword{bitor} \ \ \ \keyword{compl}}\br
    \terminal{\keyword{and_eq} \ \ \keyword{or_eq} \ \ \ \keyword{xor_eq} \ \ \keyword{not_eq}}
\end{bnf}

Each \grammarterm{operator-or-punctuator} is converted to a single token
in translation phase 7\iref{lex.phases}.%
\indextext{punctuator|)}%
\indextext{operator|)}

\rSec1[lex.digraph]{Alternative tokens}

\pnum
\indextext{token!alternative|(}%
Alternative token representations are provided for some operators and
punctuators.
\begin{footnote}
\indextext{digraph}%
These include ``digraphs'' and additional reserved words. The term
``digraph'' (token consisting of two characters) is not perfectly
descriptive, since one of the alternative \grammarterm{preprocessing-token}s is
\tcode{\%:\%:} and of course several primary tokens contain two
characters. Nonetheless, those alternative tokens that aren't lexical
keywords are colloquially known as ``digraphs''.
\end{footnote}

\pnum
In all respects of the language, each alternative token behaves the
same, respectively, as its primary token, except for its spelling.
\begin{footnote}
Thus the ``stringized'' values\iref{cpp.stringize} of
\tcode{[} and \tcode{<:} will be different, maintaining the source
spelling, but the tokens can otherwise be freely interchanged.
\end{footnote}
The set of alternative tokens is defined in
\tref{lex.digraph}.

\begin{tokentable}{Alternative tokens}{lex.digraph}{Alternative}{Primary}
\tcode{<\%}             &   \tcode{\{}         &
\keyword{and}           &   \tcode{\&\&}       &
\keyword{and_eq}        &   \tcode{\&=}        \\ \rowsep
\tcode{\%>}             &   \tcode{\}}         &
\keyword{bitor}         &   \tcode{|}          &
\keyword{or_eq}         &   \tcode{|=}         \\ \rowsep
\tcode{<:}              &   \tcode{[}          &
\keyword{or}            &   \tcode{||}         &
\keyword{xor_eq}        &   \tcode{\caret=}    \\ \rowsep
\tcode{:>}              &   \tcode{]}          &
\keyword{xor}           &   \tcode{\caret}     &
\keyword{not}           &   \tcode{!}          \\ \rowsep
\tcode{\%:}             &   \tcode{\#}         &
\keyword{compl}         &   \tcode{\~}         &
\keyword{not_eq}        &   \tcode{!=}         \\ \rowsep
\tcode{\%:\%:}          &   \tcode{\#\#}       &
\keyword{bitand}        &   \tcode{\&}         &
                        &                      \\
\end{tokentable}%
\indextext{token!alternative|)}

\rSec1[lex.token]{Tokens}

\indextext{token|(}%
\begin{bnf}
\nontermdef{token}\br
    identifier\br
    keyword\br
    literal\br
    operator-or-punctuator
\end{bnf}

\pnum
\indextext{\idxgram{token}}%
There are five kinds of tokens: identifiers, keywords, literals,%
\begin{footnote}
Literals include strings and character and numeric literals.
\end{footnote}
operators, and other separators.
\indextext{whitespace}%
Blanks, horizontal and vertical tabs, newlines, formfeeds, and comments
(collectively, ``whitespace''), as described below, are ignored except
as they serve to separate tokens.
\begin{note}
Whitespace can separate otherwise adjacent identifiers, keywords, numeric
literals, and alternative tokens containing alphabetic characters.
\end{note}
\indextext{token|)}

\rSec1[lex.name]{Identifiers}
\indextext{XID_Start}%
\indextext{XID_Continue}%

\indextext{identifier|(}%
\begin{bnf}
\nontermdef{identifier}\br
    identifier-start\br
    identifier identifier-continue
\end{bnf}

\begin{bnf}
\nontermdef{identifier-start}\br
    nondigit\br
    \textnormal{an element of the translation character set with the Unicode property XID_Start}
\end{bnf}

\begin{bnf}
\nontermdef{identifier-continue}\br
    digit\br
    nondigit\br
    \textnormal{an element of the translation character set with the Unicode property XID_Continue}
\end{bnf}

\begin{bnf}
\nontermdef{nondigit} \textnormal{one of}\br
    \terminal{a b c d e f g h i j k l m}\br
    \terminal{n o p q r s t u v w x y z}\br
    \terminal{A B C D E F G H I J K L M}\br
    \terminal{N O P Q R S T U V W X Y Z _}
\end{bnf}

\begin{bnf}
\nontermdef{digit} \textnormal{one of}\br
    \terminal{0 1 2 3 4 5 6 7 8 9}
\end{bnf}

\pnum
\indextext{name!length of}%
\indextext{name}%
\begin{note}
The character properties XID_Start and XID_Continue are described by \UAX{44} of the Unicode Standard.
\begin{footnote}
On systems in which linkers cannot accept extended
characters, an encoding of the \grammarterm{universal-character-name} can be used in
forming valid external identifiers. For example, some otherwise unused
character or sequence of characters can be used to encode the
\tcode{\textbackslash u} in a \grammarterm{universal-character-name}. Extended
characters can produce a long external identifier, but \Cpp{} does not
place a translation limit on significant characters for external
identifiers.
\end{footnote}
\end{note}
The program is ill-formed
if an \grammarterm{identifier} does not conform to
Normalization Form C as specified in the Unicode Standard.
\begin{note}
Identifiers are case-sensitive.
\end{note}
\begin{note}
\ref{uaxid} compares the requirements of \UAX{31} of the Unicode Standard
with the \Cpp{} rules for identifiers.
\end{note}
\begin{note}
In translation phase 4,
\grammarterm{identifier} also includes
those \grammarterm{preprocessing-token}s\iref{lex.pptoken}
differentiated as keywords\iref{lex.key}
in the later translation phase 7\iref{lex.token}.
\end{note}

\pnum
\indextext{\idxcode{import}}%
\indextext{\idxcode{final}}%
\indextext{\idxcode{module}}%
\indextext{\idxcode{override}}%
\indextext{\idxcode{replaceable_if_eligible}}%
\indextext{\idxcode{trivially_relocatable_if_eligible}}%
The identifiers in \tref{lex.name.special} have a special meaning when
appearing in a certain context. When referred to in the grammar, these identifiers
are used explicitly rather than using the \grammarterm{identifier} grammar production.
Unless otherwise specified, any ambiguity as to whether a given
\grammarterm{identifier} has a special meaning is resolved to interpret the
token as a regular \grammarterm{identifier}.

\begin{multicolfloattable}{Identifiers with special meaning}{lex.name.special}
{llll}
\keyword{final}                                 \\
\keyword{override}                              \\\columnbreak
\keyword{import}                                \\
\keyword{module}                                \\\columnbreak
\keyword{post}                                  \\
\keyword{pre}                                   \\\columnbreak
\keyword{replaceable_if_eligible}               \\
\keyword{trivially_relocatable_if_eligible}     \\
\end{multicolfloattable}

\pnum
\indextext{\idxcode{_}|see{character, underscore}}%
\indextext{character!underscore!in identifier}%
\indextext{reserved identifier}%
In addition, some identifiers
appearing as a \grammarterm{token} or \grammarterm{preprocessing-token}
are reserved for use by \Cpp{}
implementations and shall
not be used otherwise; no diagnostic is required.
\begin{itemize}
\item
Each identifier that contains a double underscore
\tcode{\unun}
\indextext{character!underscore}%
or begins with an underscore followed by
an uppercase letter,
other than those specified in this document
(for example, \xname{cplusplus}\iref{cpp.predefined}),
\indextext{uppercase}%
is reserved to the implementation for any use.
\item
Each identifier that begins with an underscore is
\indextext{character!underscore}%
reserved to the implementation for use as a name in the global namespace.%
\indextext{namespace!global}
\end{itemize}%
\indextext{identifier|)}

\rSec1[lex.key]{Keywords}

\begin{bnf}
\nontermdef{keyword}\br
    \textnormal{any identifier listed in \tref{lex.key}}\br
    \grammarterm{import-keyword}\br
    \grammarterm{module-keyword}\br
    \grammarterm{export-keyword}
\end{bnf}

\pnum
\indextext{keyword|(}%
The identifiers shown in \tref{lex.key} are reserved for use
as keywords (that is, they are unconditionally treated as keywords in
phase 7) except in an \grammarterm{attribute-token}\iref{dcl.attr.grammar}.
\begin{note}
The \keyword{register} keyword is unused but
is reserved for future use.
\end{note}

\begin{multicolfloattable}{Keywords}{lex.key}
{lllll}
\keyword{alignas} \\
\keyword{alignof} \\
\keyword{asm} \\
\keyword{auto} \\
\keyword{bool} \\
\keyword{break} \\
\keyword{case} \\
\keyword{catch} \\
\keyword{char} \\
\keyword{char8_t} \\
\keyword{char16_t} \\
\keyword{char32_t} \\
\keyword{class} \\
\keyword{concept} \\
\keyword{const} \\
\keyword{consteval} \\
\keyword{constexpr} \\
\columnbreak
\keyword{constinit} \\
\keyword{const_cast} \\
\keyword{continue} \\
\keyword{contract_assert} \\
\keyword{co_await} \\
\keyword{co_return} \\
\keyword{co_yield} \\
\keyword{decltype} \\
\keyword{default} \\
\keyword{delete} \\
\keyword{do} \\
\keyword{double} \\
\keyword{dynamic_cast} \\
\keyword{else} \\
\keyword{enum} \\
\keyword{explicit} \\
\keyword{export} \\
\columnbreak
\keyword{extern} \\
\keyword{false} \\
\keyword{float} \\
\keyword{for} \\
\keyword{friend} \\
\keyword{goto} \\
\keyword{if} \\
\keyword{inline} \\
\keyword{int} \\
\keyword{long} \\
\keyword{mutable} \\
\keyword{namespace} \\
\keyword{new} \\
\keyword{noexcept} \\
\keyword{nullptr} \\
\keyword{operator} \\
\keyword{private} \\
\columnbreak
\keyword{protected} \\
\keyword{public} \\
\keyword{register} \\
\keyword{reinterpret_cast} \\
\keyword{requires} \\
\keyword{return} \\
\keyword{short} \\
\keyword{signed} \\
\keyword{sizeof} \\
\keyword{static} \\
\keyword{static_assert} \\
\keyword{static_cast} \\
\keyword{struct} \\
\keyword{switch} \\
\keyword{template} \\
\keyword{this} \\
\keyword{thread_local} \\
\columnbreak
\keyword{throw} \\
\keyword{true} \\
\keyword{try} \\
\keyword{typedef} \\
\keyword{typeid} \\
\keyword{typename} \\
\keyword{union} \\
\keyword{unsigned} \\
\keyword{using} \\
\keyword{virtual} \\
\keyword{void} \\
\keyword{volatile} \\
\keyword{wchar_t} \\
\keyword{while} \\
\end{multicolfloattable}

\pnum
Furthermore, the alternative representations shown in
\tref{lex.key.digraph} for certain operators and
punctuators\iref{lex.digraph} are reserved and shall not be used
otherwise.

\begin{floattable}{Alternative representations}{lex.key.digraph}
{llllll}
\topline
\keyword{and}     &   \keyword{and_eq}  &   \keyword{bitand}  &   \keyword{bitor}   &   \keyword{compl}   &   \keyword{not} \\
\keyword{not_eq}  &   \keyword{or}      &   \keyword{or_eq}   &   \keyword{xor}     &   \keyword{xor_eq}  &       \\
\end{floattable}%
\indextext{keyword|)}%


\rSec1[lex.literal]{Literals}%
\indextext{literal|(}

\rSec2[lex.literal.kinds]{Kinds of literals}

\pnum
\indextext{constant}%
\indextext{literal!constant}%
There are several kinds of literals.
\begin{footnote}
The term ``literal'' generally designates, in this
document, those tokens that are called ``constants'' in C.
\end{footnote}

\begin{bnf}
\nontermdef{literal}\br
    integer-literal\br
    character-literal\br
    floating-point-literal\br
    string-literal\br
    boolean-literal\br
    pointer-literal\br
    user-defined-literal
\end{bnf}
\begin{note}
When appearing as an \grammarterm{expression},
a literal has a type and a value category\iref{expr.prim.literal}.
\end{note}

\rSec2[lex.icon]{Integer literals}

\indextext{literal!integer}%
\begin{bnf}
\nontermdef{integer-literal}\br
    binary-literal \opt{integer-suffix}\br
    octal-literal \opt{integer-suffix}\br
    decimal-literal \opt{integer-suffix}\br
    hexadecimal-literal \opt{integer-suffix}
\end{bnf}

\begin{bnf}
\nontermdef{binary-literal}\br
    \terminal{0b} binary-digit\br
    \terminal{0B} binary-digit\br
    binary-literal \opt{\terminal{'}} binary-digit
\end{bnf}

\begin{bnf}
\nontermdef{octal-literal}\br
    \terminal{0}\br
    octal-literal \opt{\terminal{'}} octal-digit
\end{bnf}

\begin{bnf}
\nontermdef{decimal-literal}\br
    nonzero-digit\br
    decimal-literal \opt{\terminal{'}} digit
\end{bnf}

\begin{bnf}
\nontermdef{hexadecimal-literal}\br
    hexadecimal-prefix hexadecimal-digit-sequence
\end{bnf}

\begin{bnf}
\nontermdef{binary-digit} \textnormal{one of}\br
    \terminal{0  1}
\end{bnf}

\begin{bnf}
\nontermdef{octal-digit} \textnormal{one of}\br
    \terminal{0  1  2  3  4  5  6  7}
\end{bnf}

\begin{bnf}
\nontermdef{nonzero-digit} \textnormal{one of}\br
    \terminal{1  2  3  4  5  6  7  8  9}
\end{bnf}

\begin{bnf}
\nontermdef{hexadecimal-prefix} \textnormal{one of}\br
    \terminal{0x  0X}
\end{bnf}

\begin{bnf}
\nontermdef{hexadecimal-digit-sequence}\br
    hexadecimal-digit\br
    hexadecimal-digit-sequence \opt{\terminal{'}} hexadecimal-digit
\end{bnf}

\begin{bnf}
\nontermdef{hexadecimal-digit} \textnormal{one of}\br
    \terminal{0  1  2  3  4  5  6  7  8  9}\br
    \terminal{a  b  c  d  e  f}\br
    \terminal{A  B  C  D  E  F}
\end{bnf}

\begin{bnf}
\nontermdef{integer-suffix}\br
    unsigned-suffix \opt{long-suffix} \br
    unsigned-suffix \opt{long-long-suffix} \br
    unsigned-suffix \opt{size-suffix} \br
    long-suffix \opt{unsigned-suffix} \br
    long-long-suffix \opt{unsigned-suffix} \br
    size-suffix \opt{unsigned-suffix}
\end{bnf}

\begin{bnf}
\nontermdef{unsigned-suffix} \textnormal{one of}\br
    \terminal{u  U}
\end{bnf}

\begin{bnf}
\nontermdef{long-suffix} \textnormal{one of}\br
    \terminal{l  L}
\end{bnf}

\begin{bnf}
\nontermdef{long-long-suffix} \textnormal{one of}\br
    \terminal{ll  LL}
\end{bnf}

\begin{bnf}
\nontermdef{size-suffix} \textnormal{one of}\br
   \terminal{z  Z}
\end{bnf}

\pnum
\indextext{literal!\idxcode{unsigned}}%
\indextext{literal!\idxcode{long}}%
\indextext{literal!base of integer}%
In an \grammarterm{integer-literal},
the sequence of
\grammarterm{binary-digit}s,
\grammarterm{octal-digit}s,
\grammarterm{digit}s, or
\grammarterm{hexadecimal-digit}s
is interpreted as a base $N$ integer as shown in \tref{lex.icon.base};
the lexically first digit of the sequence of digits is the most significant.
\begin{note}
The prefix and any optional separating single quotes are ignored
when determining the value.
\end{note}

\begin{simpletypetable}
{Base of \grammarterm{integer-literal}{s}}
{lex.icon.base}
{lr}
\topline
\lhdr{Kind of \grammarterm{integer-literal}} & \rhdr{base $N$} \\ \capsep
\grammarterm{binary-literal} & 2 \\
\grammarterm{octal-literal} & 8 \\
\grammarterm{decimal-literal} & 10 \\
\grammarterm{hexadecimal-literal} & 16 \\
\end{simpletypetable}

\pnum
The \grammarterm{hexadecimal-digit}s
\tcode{a} through \tcode{f} and \tcode{A} through \tcode{F}
have decimal values ten through fifteen.
\begin{example}
The number twelve can be written \tcode{12}, \tcode{014},
\tcode{0XC}, or \tcode{0b1100}. The \grammarterm{integer-literal}s \tcode{1048576},
\tcode{1'048'576}, \tcode{0X100000}, \tcode{0x10'0000}, and
\tcode{0'004'000'000} all have the same value.
\end{example}

\pnum
\indextext{literal!\idxcode{long}}%
\indextext{literal!\idxcode{unsigned}}%
\indextext{literal!integer}%
\indextext{literal!type of integer}%
\indextext{suffix!\idxcode{L}}%
\indextext{suffix!\idxcode{U}}%
\indextext{suffix!\idxcode{l}}%
\indextext{suffix!\idxcode{u}}%
The type of an \grammarterm{integer-literal} is
the first type in the list in \tref{lex.icon.type}
corresponding to its optional \grammarterm{integer-suffix}
in which its value can be represented.

\begin{floattable}{Types of \grammarterm{integer-literal}s}{lex.icon.type}{l|l|l}
\topline
\lhdr{\grammarterm{integer-suffix}} & \chdr{\grammarterm{decimal-literal}}  & \rhdr{\grammarterm{integer-literal} other than \grammarterm{decimal-literal}}   \\  \capsep
none    &
  \tcode{int} &
  \tcode{int}\\
        &
  \tcode{long int} &
  \tcode{unsigned int}\\
        &
  \tcode{long long int} &
  \tcode{long int}\\
        &
        &
  \tcode{unsigned long int}\\
        &
        &
  \tcode{long long int}\\
        &
        &
  \tcode{unsigned long long int}\\\hline
\tcode{u} or \tcode{U}  &
  \tcode{unsigned int}  &
  \tcode{unsigned int}\\
                              &
  \tcode{unsigned long int}   &
  \tcode{unsigned long int}\\
                              &
  \tcode{unsigned long long int}   &
  \tcode{unsigned long long int}\\\hline
\tcode{l} or \tcode{L}  &
  \tcode{long int}  &
  \tcode{long int}\\
                              &
  \tcode{long long int}       &
  \tcode{unsigned long int}\\
                              &
                              &
  \tcode{long long int}\\
                              &
                              &
  \tcode{unsigned long long int}\\\hline
Both \tcode{u} or \tcode{U}   &
  \tcode{unsigned long int}  &
  \tcode{unsigned long int}\\
and \tcode{l} or \tcode{L}  &
  \tcode{unsigned long long int}  &
  \tcode{unsigned long long int}\\\hline
\tcode{ll} or \tcode{LL}  &
  \tcode{long long int}       &
  \tcode{long long int}\\
                              &
                              &
  \tcode{unsigned long long int}\\\hline
Both \tcode{u} or \tcode{U}   &
  \tcode{unsigned long long int}  &
  \tcode{unsigned long long int}\\
and \tcode{ll} or \tcode{LL}  &
                              &
                              \\\hline
\tcode{z} or \tcode{Z}                  &
  the signed integer type corresponding &
  the signed integer type \\
                                        &
  \qquad to \tcode{std::size_t}\iref{support.types.layout} &
  \qquad corresponding to \tcode{std::size_t} \\
                                        &
                                        &
  \tcode{std::size_t}\\\hline
Both \tcode{u} or \tcode{U}   &
  \tcode{std::size_t}         &
  \tcode{std::size_t}         \\
and \tcode{z} or \tcode{Z}  &
                              &
                              \\
\end{floattable}

\pnum
Except for \grammarterm{integer-literal}{s} containing
a \grammarterm{size-suffix},
if the value of an \grammarterm{integer-literal}
cannot be represented by any type in its list and
an extended integer type\iref{basic.fundamental} can represent its value,
it may have that extended integer type.
If all of the types in the list for the \grammarterm{integer-literal}
are signed,
the extended integer type is signed.
If all of the types in the list for the \grammarterm{integer-literal}
are unsigned,
the extended integer type is unsigned.
If the list contains both signed and unsigned types,
the extended integer type may be signed or unsigned.
If an \grammarterm{integer-literal}
cannot be represented by any of the allowed types,
the program is ill-formed.
\begin{note}
An \grammarterm{integer-literal} with a \tcode{z} or \tcode{Z} suffix
is ill-formed if it cannot be represented by \tcode{std::size_t}.
\end{note}

\rSec2[lex.ccon]{Character literals}

\indextext{literal!character}%
\begin{bnf}
\nontermdef{character-literal}\br
    \opt{encoding-prefix} \terminal{'} c-char-sequence \terminal{'}
\end{bnf}

\begin{bnf}
\nontermdef{encoding-prefix} \textnormal{one of}\br
    \terminal{u8}\quad\terminal{u}\quad\terminal{U}\quad\terminal{L}
\end{bnf}

\begin{bnf}
\nontermdef{c-char-sequence}\br
    c-char \opt{c-char-sequence}
\end{bnf}

\begin{bnf}
\nontermdef{c-char}\br
    basic-c-char\br
    escape-sequence\br
    universal-character-name
\end{bnf}

\begin{bnf}
\nontermdef{basic-c-char}\br
    \textnormal{any member of the translation character set except the \unicode{0027}{apostrophe},}\br
    \bnfindent\textnormal{\unicode{005c}{reverse solidus}, or new-line character}
\end{bnf}

\begin{bnf}
\nontermdef{escape-sequence}\br
    simple-escape-sequence\br
    numeric-escape-sequence\br
    conditional-escape-sequence
\end{bnf}

\begin{bnf}
\nontermdef{simple-escape-sequence}\br
    \terminal{\textbackslash} simple-escape-sequence-char
\end{bnf}

\begin{bnf}
\nontermdef{simple-escape-sequence-char} \textnormal{one of}\br
    \terminal{'  "  ?  \textbackslash{} a  b  f  n  r  t  v}
\end{bnf}

\begin{bnf}
\nontermdef{numeric-escape-sequence}\br
    octal-escape-sequence\br
    hexadecimal-escape-sequence
\end{bnf}

\begin{bnf}
\nontermdef{simple-octal-digit-sequence}\br
    octal-digit \opt{simple-octal-digit-sequence}
\end{bnf}

\begin{bnf}
\nontermdef{octal-escape-sequence}\br
    \terminal{\textbackslash} octal-digit\br
    \terminal{\textbackslash} octal-digit octal-digit\br
    \terminal{\textbackslash} octal-digit octal-digit octal-digit\br
    \terminal{\textbackslash o\{} simple-octal-digit-sequence \terminal{\}}
\end{bnf}

\begin{bnf}
\nontermdef{hexadecimal-escape-sequence}\br
    \terminal{\textbackslash x} simple-hexadecimal-digit-sequence\br
    \terminal{\textbackslash x\{} simple-hexadecimal-digit-sequence \terminal{\}}
\end{bnf}

\begin{bnf}
\nontermdef{conditional-escape-sequence}\br
    \terminal{\textbackslash} conditional-escape-sequence-char
\end{bnf}

\begin{bnf}
\nontermdef{conditional-escape-sequence-char}\br
    \textnormal{any member of the basic character set that is not an} octal-digit\textnormal{, a} simple-escape-sequence-char\textnormal{, or the characters \terminal{N}, \terminal{o}, \terminal{u}, \terminal{U}, or \terminal{x}}
\end{bnf}

\pnum
\indextext{literal!character}%
\indextext{literal!\idxcode{char8_t}}%
\indextext{literal!\idxcode{char16_t}}%
\indextext{literal!\idxcode{char32_t}}%
\indextext{literal!type of character}%
\indextext{type!\idxcode{char8_t}}%
\indextext{type!\idxcode{char16_t}}%
\indextext{type!\idxcode{char32_t}}%
\indextext{wide-character}%
\indextext{type!\idxcode{wchar_t}}%
A \defnadj{multicharacter}{literal} is a \grammarterm{character-literal}
whose \grammarterm{c-char-sequence} consists of
more than one \grammarterm{c-char}.
A multicharacter literal shall not have an \grammarterm{encoding-prefix}.
If a multicharacter literal contains a \grammarterm{c-char}
that is not encodable as a single code unit in the ordinary literal encoding,
the program is ill-formed.
Multicharacter literals are conditionally-supported.

\pnum
The kind of a \grammarterm{character-literal},
its type, and its associated character encoding\iref{lex.charset}
are determined by
its \grammarterm{encoding-prefix} and its \grammarterm{c-char-sequence}
as defined by \tref{lex.ccon.literal}.

\begin{floattable}{Character literals}{lex.ccon.literal}
{l|l|l|l|l}
\topline
\lhdr{Encoding} & \chdr{Kind} & \chdr{Type} & \chdr{Associated char-} & \rhdr{Example} \\
\lhdr{prefix} & \chdr{} & \chdr{} & \chdr{acter encoding} & \\
\capsep
none &
\defnx{ordinary character literal}{literal!character!ordinary} &
\keyword{char} &
ordinary literal &
\tcode{'v'} \\ \cline{2-3}\cline{5-5}
 &
multicharacter literal &
\keyword{int} &
encoding &
\tcode{'abcd'} \\ \hline
\tcode{L} &
\defnx{wide character literal}{literal!character!wide} &
\keyword{wchar_t} &
wide literal &
\tcode{L'w'} \\
 & & & encoding & \\ \hline
\tcode{u8} &
\defnx{UTF-8 character literal}{literal!character!UTF-8} &
\keyword{char8_t} &
UTF-8 &
\tcode{u8'x'} \\ \hline
\tcode{u} &
\defnx{UTF-16 character literal}{literal!character!UTF-16} &
\keyword{char16_t} &
UTF-16 &
\tcode{u'y'} \\ \hline
\tcode{U} &
\defnx{UTF-32 character literal}{literal!character!UTF-32} &
\keyword{char32_t} &
UTF-32 &
\tcode{U'z'} \\
\end{floattable}

\pnum
In translation phase 4,
the value of a \grammarterm{character-literal} is determined
using the range of representable values
of the \grammarterm{character-literal}'s type in translation phase 7.
A multicharacter literal has an
\impldef{value of multicharacter literal}
value.
The value of any other kind of \grammarterm{character-literal}
is determined as follows:
\begin{itemize}
\item
A \grammarterm{character-literal} with
a \grammarterm{c-char-sequence} consisting of a single
\grammarterm{basic-c-char},
\grammarterm{simple-escape-sequence}, or
\grammarterm{universal-character-name}
is the code unit value of the specified character
as encoded in the literal's associated character encoding.
If the specified character lacks
representation in the literal's associated character encoding or
if it cannot be encoded as a single code unit,
then the program is ill-formed.
\item
A \grammarterm{character-literal} with
a \grammarterm{c-char-sequence} consisting of
a single \grammarterm{numeric-escape-sequence}
has a value as follows:
\begin{itemize}
\item
Let $v$ be the integer value represented by
the octal number comprising
the sequence of \grammarterm{octal-digit}{s} in
an \grammarterm{octal-escape-sequence} or by
the hexadecimal number comprising
the sequence of \grammarterm{hexadecimal-digit}{s} in
a \grammarterm{hexadecimal-escape-sequence}.
\item
If $v$ does not exceed
the range of representable values of the \grammarterm{character-literal}'s type,
then the value is $v$.
\item
Otherwise,
if the \grammarterm{character-literal}'s \grammarterm{encoding-prefix}
is absent or \tcode{L}, and
$v$ does not exceed the range of representable values of the corresponding unsigned type for the underlying type of the \grammarterm{character-literal}'s type,
then the value is the unique value of the \grammarterm{character-literal}'s type \tcode{T} that is congruent to $v$ modulo $2^N$, where $N$ is the width of \tcode{T}.
\item
Otherwise, the program is ill-formed.
\end{itemize}
\item
A \grammarterm{character-literal} with
a \grammarterm{c-char-sequence} consisting of
a single \grammarterm{conditional-escape-sequence}
is conditionally-supported and
has an \impldef{value of \grammarterm{conditional-escape-sequence}} value.
\end{itemize}

\pnum
\indextext{backslash character}%
\indextext{\idxcode{\textbackslash}|see{backslash character}}%
\indextext{escape character|see{backslash character}}%
The character specified by a \grammarterm{simple-escape-sequence}
is specified in \tref{lex.ccon.esc}.
\begin{note}
Using an escape sequence for a question mark
is supported for compatibility with \CppXIV{} and C.
\end{note}

\begin{floattable}{Simple escape sequences}{lex.ccon.esc}
{lll}
\topline
\lhdrx{2}{character} &  \rhdr{\grammarterm{simple-escape-sequence}} \\ \capsep
\ucode{000a} & \uname{line feed}            & \tcode{\textbackslash n} \\
\ucode{0009} & \uname{character tabulation} & \tcode{\textbackslash t} \\
\ucode{000b} & \uname{line tabulation}      & \tcode{\textbackslash v} \\
\ucode{0008} & \uname{backspace}            & \tcode{\textbackslash b} \\
\ucode{000d} & \uname{carriage return}      & \tcode{\textbackslash r} \\
\ucode{000c} & \uname{form feed}            & \tcode{\textbackslash f} \\
\ucode{0007} & \uname{alert}                & \tcode{\textbackslash a} \\
\ucode{005c} & \uname{reverse solidus}      & \tcode{\textbackslash\textbackslash} \\
\ucode{003f} & \uname{question mark}        & \tcode{\textbackslash ?} \\
\ucode{0027} & \uname{apostrophe}           & \tcode{\textbackslash '} \\
\ucode{0022} & \uname{quotation mark}       & \tcode{\textbackslash "} \\
\end{floattable}

\rSec2[lex.fcon]{Floating-point literals}

\indextext{literal!floating-point}%
\begin{bnf}
\nontermdef{floating-point-literal}\br
    decimal-floating-point-literal\br
    hexadecimal-floating-point-literal
\end{bnf}

\begin{bnf}
\nontermdef{decimal-floating-point-literal}\br
    fractional-constant \opt{exponent-part} \opt{floating-point-suffix}\br
    digit-sequence exponent-part \opt{floating-point-suffix}
\end{bnf}

\begin{bnf}
\nontermdef{hexadecimal-floating-point-literal}\br
    hexadecimal-prefix hexadecimal-fractional-constant binary-exponent-part \opt{floating-point-suffix}\br
    hexadecimal-prefix hexadecimal-digit-sequence binary-exponent-part \opt{floating-point-suffix}
\end{bnf}

\begin{bnf}
\nontermdef{fractional-constant}\br
    \opt{digit-sequence} \terminal{.} digit-sequence\br
    digit-sequence \terminal{.}
\end{bnf}

\begin{bnf}
\nontermdef{hexadecimal-fractional-constant}\br
    \opt{hexadecimal-digit-sequence} \terminal{.} hexadecimal-digit-sequence\br
    hexadecimal-digit-sequence \terminal{.}
\end{bnf}

\begin{bnf}
\nontermdef{exponent-part}\br
    \terminal{e} \opt{sign} digit-sequence\br
    \terminal{E} \opt{sign} digit-sequence
\end{bnf}

\begin{bnf}
\nontermdef{binary-exponent-part}\br
    \terminal{p} \opt{sign} digit-sequence\br
    \terminal{P} \opt{sign} digit-sequence
\end{bnf}

\begin{bnf}
\nontermdef{sign} \textnormal{one of}\br
    \terminal{+  -}
\end{bnf}

\begin{bnf}
\nontermdef{digit-sequence}\br
    digit\br
    digit-sequence \opt{\terminal{'}} digit
\end{bnf}

\begin{bnf}
\nontermdef{floating-point-suffix} \textnormal{one of}\br
    \terminal{f  l  f16  f32  f64  f128  bf16  F  L  F16  F32  F64  F128  BF16}
\end{bnf}

\pnum
\indextext{literal!type of floating-point}%
\indextext{literal!\idxcode{float}}%
\indextext{suffix!\idxcode{F}}%
\indextext{suffix!\idxcode{f}}%
\indextext{suffix!\idxcode{L}}%
\indextext{suffix!\idxcode{l}}%
\indextext{literal!\idxcode{long double}}%
The type of
a \grammarterm{floating-point-literal}\iref{basic.fundamental,basic.extended.fp}
is determined by
its \grammarterm{floating-point-suffix} as specified in \tref{lex.fcon.type}.
\begin{note}
The floating-point suffixes
\tcode{f16}, \tcode{f32}, \tcode{f64}, \tcode{f128}, \tcode{bf16},
\tcode{F16}, \tcode{F32}, \tcode{F64}, \tcode{F128}, and \tcode{BF16}
are conditionally-supported. See~\ref{basic.extended.fp}.
\end{note}
\begin{simpletypetable}
{Types of \grammarterm{floating-point-literal}{s}}
{lex.fcon.type}
{ll}
\topline
\lhdr{\grammarterm{floating-point-suffix}} & \rhdr{type} \\ \capsep
none & \keyword{double} \\
\tcode{f} or \tcode{F} & \keyword{float} \\
\tcode{l} or \tcode{L} & \keyword{long} \keyword{double} \\
\tcode{f16} or \tcode{F16} & \tcode{std::float16_t} \\
\tcode{f32} or \tcode{F32} & \tcode{std::float32_t} \\
\tcode{f64} or \tcode{F64} & \tcode{std::float64_t} \\
\tcode{f128} or \tcode{F128} & \tcode{std::float128_t} \\
\tcode{bf16} or \tcode{BF16} & \tcode{std::bfloat16_t} \\
\end{simpletypetable}

\pnum
\indextext{literal!floating-point}%
The \defn{significand} of a \grammarterm{floating-point-literal}
is the \grammarterm{fractional-constant} or \grammarterm{digit-sequence}
of a \grammarterm{decimal-floating-point-literal}
or the \grammarterm{hexadecimal-fractional-constant}
or \grammarterm{hexadecimal-digit-sequence}
of a \grammarterm{hexadecimal-floating-point-literal}.
In the significand,
the sequence of \grammarterm{digit}s or \grammarterm{hexadecimal-digit}s
and optional period are interpreted as a base $N$ real number $s$,
where $N$ is 10 for a \grammarterm{decimal-floating-point-literal} and
16 for a \grammarterm{hexadecimal-floating-point-literal}.
\begin{note}
Any optional separating single quotes are ignored when determining the value.
\end{note}
If an \grammarterm{exponent-part} or \grammarterm{binary-exponent-part}
is present,
the exponent $e$ of the \grammarterm{floating-point-literal}
is the result of interpreting
the sequence of an optional \grammarterm{sign} and the \grammarterm{digit}s
as a base 10 integer.
Otherwise, the exponent $e$ is 0.
The scaled value of the literal is
$s \times 10^e$ for a \grammarterm{decimal-floating-point-literal} and
$s \times 2^e$ for a \grammarterm{hexadecimal-floating-point-literal}.
\begin{example}
The \grammarterm{floating-point-literal}{s}
\tcode{49.625} and \tcode{0xC.68p+2} have the same value.
The \grammarterm{floating-point-literal}{s}
\tcode{1.602'176'565e-19} and \tcode{1.602176565e-19}
have the same value.
\end{example}

\pnum
If the scaled value is not in the range of representable
values for its type, the program is ill-formed.
Otherwise, the value of a \grammarterm{floating-point-literal}
is the scaled value if representable,
else the larger or smaller representable value nearest the scaled value,
chosen in an \impldef{choice of larger or smaller value of
\grammarterm{floating-point-literal}} manner.

\rSec2[lex.string]{String literals}

\indextext{literal!string}%
\begin{bnf}
\nontermdef{string-literal}\br
    \opt{encoding-prefix} \terminal{"} \opt{s-char-sequence} \terminal{"}\br
    \opt{encoding-prefix} \terminal{R} raw-string
\end{bnf}

\begin{bnf}
\nontermdef{s-char-sequence}\br
    s-char \opt{s-char-sequence}
\end{bnf}

\begin{bnf}
\nontermdef{s-char}\br
    basic-s-char\br
    escape-sequence\br
    universal-character-name
\end{bnf}

\begin{bnf}
\nontermdef{basic-s-char}\br
    \textnormal{any member of the translation character set except the \unicode{0022}{quotation mark},}\br
    \bnfindent\textnormal{\unicode{005c}{reverse solidus}, or new-line character}
\end{bnf}

\begin{bnf}
\nontermdef{raw-string}\br
    \terminal{"} \opt{d-char-sequence} \terminal{(} \opt{r-char-sequence} \terminal{)} \opt{d-char-sequence} \terminal{"}
\end{bnf}

\begin{bnf}
\nontermdef{r-char-sequence}\br
    r-char \opt{r-char-sequence}
\end{bnf}

\begin{bnf}
\nontermdef{r-char}\br
    \textnormal{any member of the translation character set, except a \unicode{0029}{right parenthesis} followed by}\br
    \bnfindent\textnormal{the initial \grammarterm{d-char-sequence} (which may be empty) followed by a \unicode{0022}{quotation mark}}
\end{bnf}

\begin{bnf}
\nontermdef{d-char-sequence}\br
    d-char \opt{d-char-sequence}
\end{bnf}

\begin{bnf}
\nontermdef{d-char}\br
    \textnormal{any member of the basic character set except:}\br
    \bnfindent\textnormal{\unicode{0020}{space}, \unicode{0028}{left parenthesis}, \unicode{0029}{right parenthesis}, \unicode{005c}{reverse solidus},}\br
    \bnfindent\textnormal{\unicode{0009}{character tabulation}, \unicode{000b}{line tabulation}, \unicode{000c}{form feed}, and new-line}
\end{bnf}

\pnum
\indextext{literal!string}%
\indextext{character string}%
\indextext{string!type of}%
\indextext{type!\idxcode{wchar_t}}%
\indextext{prefix!\idxcode{L}}%
\indextext{literal!string!\idxcode{char16_t}}%
\indextext{type!\idxcode{char16_t}}%
\indextext{literal!string!\idxcode{char32_t}}%
\indextext{type!\idxcode{char32_t}}%
The kind of a \grammarterm{string-literal},
its type, and
its associated character encoding\iref{lex.charset}
are determined by its encoding prefix and sequence of
\grammarterm{s-char}s or \grammarterm{r-char}s
as defined by \tref{lex.string.literal}
where $n$ is the number of encoded code units
that would result from an evaluation of the \grammarterm{string-literal}
(see below).

\begin{floattable}{String literals}{lex.string.literal}
{llp{2.6cm}p{2.3cm}p{4.7cm}}
\topline
\lhdr{Enco-} & \chdr{Kind} & \chdr{Type} & \chdr{Associated} & \rhdr{Examples} \\
\lhdr{ding}   & \chdr{} & \chdr{} & \chdr{character}  & \rhdr{} \\
\lhdr{prefix}         & \chdr{} & \chdr{} & \chdr{encoding}   & \rhdr{} \\
\capsep
none &
\defnx{ordinary string literal}{literal!string!ordinary} &
array of $n$\newline \tcode{\keyword{const} \keyword{char}} &
ordinary literal encoding &
\tcode{"ordinary string"}\newline
\tcode{R"(ordinary raw string)"} \\
\tcode{L} &
\defnx{wide string literal}{literal!string!wide} &
array of $n$\newline \tcode{\keyword{const} \keyword{wchar_t}} &
wide literal\newline encoding &
\tcode{L"wide string"}\newline
\tcode{LR"w(wide raw string)w"} \\
\tcode{u8} &
\defnx{UTF-8 string literal}{literal!string!UTF-8} &
array of $n$\newline \tcode{\keyword{const} \keyword{char8_t}} &
UTF-8 &
\tcode{u8"UTF-8 string"}\newline
\tcode{u8R"x(UTF-8 raw string)x"} \\
\tcode{u} &
\defnx{UTF-16 string literal}{literal!string!UTF-16} &
array of $n$\newline \tcode{\keyword{const} \keyword{char16_t}} &
UTF-16 &
\tcode{u"UTF-16 string"}\newline
\tcode{uR"y(UTF-16 raw string)y"} \\
\tcode{U} &
\defnx{UTF-32 string literal}{literal!string!UTF-32} &
array of $n$\newline \tcode{\keyword{const} \keyword{char32_t}} &
UTF-32 &
\tcode{U"UTF-32 string"}\newline
\tcode{UR"z(UTF-32 raw string)z"} \\
\end{floattable}

\pnum
\indextext{literal!string!raw}%
A \grammarterm{string-literal} that has an \tcode{R}
\indextext{prefix!\idxcode{R}}%
in the prefix is a \defn{raw string literal}. The
\grammarterm{d-char-sequence} serves as a delimiter. The terminating
\grammarterm{d-char-sequence} of a \grammarterm{raw-string} is the same sequence of
characters as the initial \grammarterm{d-char-sequence}. A \grammarterm{d-char-sequence}
shall consist of at most 16 characters.

\pnum
\begin{note}
The characters \tcode{'('} and \tcode{')'} can appear in a
\grammarterm{raw-string}. Thus, \tcode{R"delimiter((a|b))delimiter"} is equivalent to
\tcode{"(a|b)"}.
\end{note}

\pnum
\begin{note}
A source-file new-line in a raw string literal results in a new-line in the
resulting execution string literal. Assuming no
whitespace at the beginning of lines in the following example, the assert will succeed:
\begin{codeblock}
const char* p = R"(a\
b
c)";
assert(std::strcmp(p, "a\\\nb\nc") == 0);
\end{codeblock}
\end{note}

\pnum
\begin{example}
The raw string
\begin{codeblock}
R"a(
)\
a"
)a"
\end{codeblock}
is equivalent to \tcode{"\textbackslash n)\textbackslash \textbackslash \textbackslash na\textbackslash"\textbackslash n"}. The raw string
\begin{codeblock}
R"(x = "\"y\"")"
\end{codeblock}
is equivalent to \tcode{"x = \textbackslash "\textbackslash\textbackslash\textbackslash "y\textbackslash\textbackslash\textbackslash "\textbackslash ""}.
\end{example}

\pnum
\indextext{literal!narrow-character}%
Ordinary string literals and UTF-8 string literals are
also referred to as \defnx{narrow string literals}{literal!string!narrow}.

\pnum
\indextext{concatenation!string}%
The \grammarterm{string-literal}{s} in
any sequence of adjacent \grammarterm{string-literal}{s}
shall have at most one unique \grammarterm{encoding-prefix} among them.
The common \grammarterm{encoding-prefix} of the sequence is
that \grammarterm{encoding-prefix}, if any.
\begin{note}
A \grammarterm{string-literal}'s rawness has
no effect on the determination of the common \grammarterm{encoding-prefix}.
\end{note}

\pnum
In translation phase 6\iref{lex.phases},
adjacent \grammarterm{string-literal}s are concatenated.
The lexical structure and grouping of
the contents of the individual \grammarterm{string-literal}s is retained.
\begin{example}
\begin{codeblock}
"\xA" "B"
\end{codeblock}
represents
the code unit \tcode{'\textbackslash xA'} and the character \tcode{'B'}
after concatenation
(and not the single code unit \tcode{'\textbackslash xAB'}).
Similarly,
\begin{codeblock}
R"(\u00)" "41"
\end{codeblock}
represents six characters,
starting with a backslash and ending with the digit \tcode{1}
(and not the single character \tcode{'A'}
specified by a \grammarterm{universal-character-name}).

\tref{lex.string.concat} has some examples of valid concatenations.
\end{example}

\begin{floattable}{String literal concatenations}{lex.string.concat}
{lll|lll|lll}
\topline
\multicolumn{2}{|c}{Source} &
Means &
\multicolumn{2}{c}{Source} &
Means &
\multicolumn{2}{c}{Source} &
Means \\
\tcode{u"a"} & \tcode{u"b"} & \tcode{u"ab"} &
\tcode{U"a"} & \tcode{U"b"} & \tcode{U"ab"} &
\tcode{L"a"} & \tcode{L"b"} & \tcode{L"ab"} \\
\tcode{u"a"} & \tcode{"b"}  & \tcode{u"ab"} &
\tcode{U"a"} & \tcode{"b"}  & \tcode{U"ab"} &
\tcode{L"a"} & \tcode{"b"}  & \tcode{L"ab"} \\
\tcode{"a"}  & \tcode{u"b"} & \tcode{u"ab"} &
\tcode{"a"}  & \tcode{U"b"} & \tcode{U"ab"} &
\tcode{"a"}  & \tcode{L"b"} & \tcode{L"ab"} \\
\end{floattable}

\pnum
Evaluating a \grammarterm{string-literal} results in a string literal object
with static storage duration\iref{basic.stc}.
\begin{note}
String literal objects are potentially non-unique\iref{intro.object}.
Whether successive evaluations of a
\grammarterm{string-literal} yield the same or a different object is
unspecified.
\end{note}
\begin{note}
\indextext{literal!string!undefined change to}%
The effect of attempting to modify a string literal object is undefined.
\end{note}

\pnum
\indextext{\idxcode{0}!string terminator}%
\indextext{\idxcode{0}!null character|see{character, null}}%
String literal objects are initialized with
the sequence of code unit values
corresponding to the \grammarterm{string-literal}'s sequence of
\grammarterm{s-char}s (originally from non-raw string literals) and
\grammarterm{r-char}s (originally from raw string literals),
plus a terminating \unicode{0000}{null} character,
in order as follows:
\begin{itemize}
\item
The sequence of characters denoted by each contiguous sequence of
\grammarterm{basic-s-char}s,
\grammarterm{r-char}s,
\grammarterm{simple-escape-sequence}s\iref{lex.ccon}, and
\grammarterm{universal-character-name}s\iref{lex.charset}
is encoded to a code unit sequence
using the \grammarterm{string-literal}'s associated character encoding.
If a character lacks representation in the associated character encoding,
then the program is ill-formed.
\begin{note}
No character lacks representation in any Unicode encoding form.
\end{note}
When encoding a stateful character encoding,
implementations should encode the first such sequence
beginning with the initial encoding state and
encode subsequent sequences
beginning with the final encoding state of the prior sequence.
\begin{note}
The encoded code unit sequence can differ from
the sequence of code units that would be obtained by
encoding each character independently.
\end{note}
\item
Each \grammarterm{numeric-escape-sequence}\iref{lex.ccon}
contributes a single code unit with a value as follows:
\begin{itemize}
\item
Let $v$ be the integer value represented by
the octal number comprising
the sequence of \grammarterm{octal-digit}{s} in
an \grammarterm{octal-escape-sequence} or by
the hexadecimal number comprising
the sequence of \grammarterm{hexadecimal-digit}{s} in
a \grammarterm{hexadecimal-escape-sequence}.
\item
If $v$ does not exceed the range of representable values of
the \grammarterm{string-literal}'s array element type,
then the value is $v$.
\item
Otherwise,
if the \grammarterm{string-literal}'s \grammarterm{encoding-prefix}
is absent or \tcode{L}, and
$v$ does not exceed the range of representable values of
the corresponding unsigned type for the underlying type of
the \grammarterm{string-literal}'s array element type,
then the value is the unique value of
the \grammarterm{string-literal}'s array element type \tcode{T}
that is congruent to $v$ modulo $2^N$, where $N$ is the width of \tcode{T}.
\item
Otherwise, the program is ill-formed.
\end{itemize}
When encoding a stateful character encoding,
these sequences should have no effect on encoding state.
\item
Each \grammarterm{conditional-escape-sequence}\iref{lex.ccon}
contributes an
\impldef{code unit sequence for \grammarterm{conditional-escape-sequence}}
code unit sequence.
When encoding a stateful character encoding,
it is
\impldef{effect of \grammarterm{conditional-escape-sequence} on encoding state}
what effect these sequences have on encoding state.
\end{itemize}

\rSec2[lex.string.uneval]{Unevaluated strings}

\begin{bnf}
\nontermdef{unevaluated-string}\br
    string-literal
\end{bnf}

\pnum
An \grammarterm{unevaluated-string} shall have no \grammarterm{encoding-prefix}.

\pnum
Each \grammarterm{universal-character-name} and each \grammarterm{simple-escape-sequence} in an \grammarterm{unevaluated-string} is
replaced by the member of the translation character set it denotes.
An \grammarterm{unevaluated-string} that contains
a \grammarterm{numeric-escape-sequence} or
a \grammarterm{conditional-escape-sequence}
is ill-formed.

\pnum
An \grammarterm{unevaluated-string} is never evaluated and
its interpretation depends on the context in which it appears.

\rSec2[lex.bool]{Boolean literals}

\indextext{literal!boolean}%
\begin{bnf}
\nontermdef{boolean-literal}\br
    \keyword{false}\br
    \keyword{true}
\end{bnf}

\pnum
\indextext{Boolean literal}%
The Boolean literals are the keywords \keyword{false} and \keyword{true}.
Such literals have type \keyword{bool}.

\rSec2[lex.nullptr]{Pointer literals}

\indextext{literal!pointer}%
\begin{bnf}
\nontermdef{pointer-literal}\br
    \keyword{nullptr}
\end{bnf}

\pnum
The pointer literal is the keyword \keyword{nullptr}. It has type
\tcode{std::nullptr_t}.
\begin{note}
\tcode{std::nullptr_t} is a distinct type that is neither a pointer type nor a pointer-to-member type;
rather, a prvalue of this type is a null pointer constant and can be
converted to a null pointer value or null member pointer value. See~\ref{conv.ptr}
and~\ref{conv.mem}.
\end{note}

\rSec2[lex.ext]{User-defined literals}

\indextext{literal!user-defined}%
\begin{bnf}
\nontermdef{user-defined-literal}\br
    user-defined-integer-literal\br
    user-defined-floating-point-literal\br
    user-defined-string-literal\br
    user-defined-character-literal
\end{bnf}

\begin{bnf}
\nontermdef{user-defined-integer-literal}\br
    decimal-literal ud-suffix\br
    octal-literal ud-suffix\br
    hexadecimal-literal ud-suffix\br
    binary-literal ud-suffix
\end{bnf}

\begin{bnf}
\nontermdef{user-defined-floating-point-literal}\br
    fractional-constant \opt{exponent-part} ud-suffix\br
    digit-sequence exponent-part ud-suffix\br
    hexadecimal-prefix hexadecimal-fractional-constant binary-exponent-part ud-suffix\br
    hexadecimal-prefix hexadecimal-digit-sequence binary-exponent-part ud-suffix
\end{bnf}

\begin{bnf}
\nontermdef{user-defined-string-literal}\br
    string-literal ud-suffix
\end{bnf}

\begin{bnf}
\nontermdef{user-defined-character-literal}\br
    character-literal ud-suffix
\end{bnf}

\begin{bnf}
\nontermdef{ud-suffix}\br
    identifier
\end{bnf}

\pnum
If a token matches both \grammarterm{user-defined-literal} and another \grammarterm{literal} kind, it
is treated as the latter.
\begin{example}
\tcode{123_km}
is a \grammarterm{user-defined-literal}, but \tcode{12LL} is an
\grammarterm{integer-literal}.
\end{example}
The syntactic non-terminal preceding the \grammarterm{ud-suffix} in a
\grammarterm{user-defined-literal} is taken to be the longest sequence of
characters that could match that non-terminal.

\pnum
A \grammarterm{user-defined-literal} is treated as a call to a literal operator or
literal operator template\iref{over.literal}.
To determine the form of this call for
a given \grammarterm{user-defined-literal} \placeholder{L}
with \grammarterm{ud-suffix} \placeholder{X},
first let \placeholder{S} be the set of declarations
found by unqualified lookup for the \grammarterm{literal-operator-id}
whose literal suffix identifier is \placeholder{X}\iref{basic.lookup.unqual}.
\placeholder{S} shall not be empty.

\pnum
If \placeholder{L} is a \grammarterm{user-defined-integer-literal}, let \placeholder{n} be the literal
without its \grammarterm{ud-suffix}. If \placeholder{S} contains a literal operator with
parameter type \tcode{unsigned long long}, the literal \placeholder{L} is treated as a call of
the form
\begin{codeblock}
operator ""@\placeholder{X}@(@\placeholder{n}@ULL)
\end{codeblock}
Otherwise, \placeholder{S} shall contain a raw literal operator
or a numeric literal operator template\iref{over.literal} but not both.
If \placeholder{S} contains a raw literal operator,
the literal \placeholder{L} is treated as a call of the form
\begin{codeblock}
operator ""@\placeholder{X}@("@\placeholder{n}@")
\end{codeblock}
Otherwise (\placeholder{S} contains a numeric literal operator template),
\placeholder{L} is treated as a call of the form
\begin{codeblock}
operator ""@\placeholder{X}@<'@$c_1$@', '@$c_2$@', ... '@$c_k$@'>()
\end{codeblock}
where \placeholder{n} is the source character sequence $c_1c_2...c_k$.
\begin{note}
The sequence
$c_1c_2...c_k$ can only contain characters from the basic character set.
\end{note}

\pnum
If \placeholder{L} is a \grammarterm{user-defined-floating-point-literal}, let \placeholder{f} be the
literal without its \grammarterm{ud-suffix}. If \placeholder{S} contains a literal operator
with parameter type \tcode{long double}, the literal \placeholder{L} is treated as a call of
the form
\begin{codeblock}
operator ""@\placeholder{X}@(@\placeholder{f}@L)
\end{codeblock}
Otherwise, \placeholder{S} shall contain a raw literal operator
or a numeric literal operator template\iref{over.literal} but not both.
If \placeholder{S} contains a raw literal operator,
the \grammarterm{literal} \placeholder{L} is treated as a call of the form
\begin{codeblock}
operator ""@\placeholder{X}@("@\placeholder{f}@")
\end{codeblock}
Otherwise (\placeholder{S} contains a numeric literal operator template),
\placeholder{L} is treated as a call of the form
\begin{codeblock}
operator ""@\placeholder{X}@<'@$c_1$@', '@$c_2$@', ... '@$c_k$@'>()
\end{codeblock}
where \placeholder{f} is the source character sequence $c_1c_2...c_k$.
\begin{note}
The sequence
$c_1c_2...c_k$ can only contain characters from the basic character set.
\end{note}

\pnum
If \placeholder{L} is a \grammarterm{user-defined-string-literal},
let \placeholder{str} be the literal without its \grammarterm{ud-suffix}
and let \placeholder{len} be the number of code units in \placeholder{str}
(i.e., its length excluding the terminating null character).
If \placeholder{S} contains a literal operator template with
a constant template parameter for which \placeholder{str} is
a well-formed \grammarterm{template-argument},
the literal \placeholder{L} is treated as a call of the form
\begin{codeblock}
operator ""@\placeholder{X}@<@\placeholder{str}{}@>()
\end{codeblock}
Otherwise, the literal \placeholder{L} is treated as a call of the form
\begin{codeblock}
operator ""@\placeholder{X}@(@\placeholder{str}{}@, @\placeholder{len}{}@)
\end{codeblock}

\pnum
If \placeholder{L} is a \grammarterm{user-defined-character-literal}, let \placeholder{ch} be the
literal without its \grammarterm{ud-suffix}.
\placeholder{S} shall contain a literal operator\iref{over.literal} whose only parameter has
the type of \placeholder{ch} and the
literal \placeholder{L} is treated as a call
of the form
\begin{codeblock}
operator ""@\placeholder{X}@(@\placeholder{ch}{}@)
\end{codeblock}

\pnum
\begin{example}
\begin{codeblock}
long double operator ""_w(long double);
std::string operator ""_w(const char16_t*, std::size_t);
unsigned operator ""_w(const char*);
int main() {
  1.2_w;            // calls \tcode{operator ""_w(1.2L)}
  u"one"_w;         // calls \tcode{operator ""_w(u"one", 3)}
  12_w;             // calls \tcode{operator ""_w("12")}
  "two"_w;          // error: no applicable literal operator
}
\end{codeblock}
\end{example}

\pnum
In translation phase 6\iref{lex.phases}, adjacent \grammarterm{string-literal}s are concatenated and
\grammarterm{user-defined-string-literal}{s} are considered \grammarterm{string-literal}s for that
purpose. During concatenation, \grammarterm{ud-suffix}{es} are removed and ignored and
the concatenation process occurs as described in~\ref{lex.string}. At the end of phase
6, if a \grammarterm{string-literal} is the result of a concatenation involving at least one
\grammarterm{user-defined-string-literal}, all the participating
\grammarterm{user-defined-string-literal}{s} shall have the same \grammarterm{ud-suffix}
and that suffix is applied to the result of the concatenation.

\pnum
\begin{example}
\begin{codeblock}
int main() {
  L"A" "B" "C"_x;   // OK, same as \tcode{L"ABC"_x}
  "P"_x "Q" "R"_y;  // error: two different \grammarterm{ud-suffix}{es}
}
\end{codeblock}
\end{example}
\indextext{literal|)}%
\indextext{conventions!lexical|)}