diff --git a/.github/workflows/test_and_build.yaml b/.github/workflows/test_and_build.yaml index 5268a2b..8bea027 100644 --- a/.github/workflows/test_and_build.yaml +++ b/.github/workflows/test_and_build.yaml @@ -1,6 +1,6 @@ name: Tests -on: [push] +on: [push, pull_request] jobs: build: @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12.0"] + python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] steps: - uses: actions/checkout@v3 diff --git a/CHANGELOG.md b/CHANGELOG.md index 7321afc..76b9112 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,35 @@ +In Development +-------------- + +* Python 3.8 and 3.9 are no longer supported. + +2.3.0 (August 26, 2025) +----------------------- + +* The package name is changed from using an underscore (email_validator) to a dash (email-validator) to match PyPi's normalized package name. +* The library no longer checks that the local part is at most 64 characters because a more careful reading of RFC 5321 indicates the limit is optional and such email addresses have been found in the wild. However the check can be restored using a new `strict=True` parameter, and the overall 254 character email address length limit is still in place. +* New EmailSyntaxError messages are used for some exiting syntax errors related to @-sign homoglyphs and invalid characters in internationalized domains. +* When using `allow_display_name=True`, display names are now returned with Unicode NFC normalization. +* TypeError is now raised if something other than str (or bytes) is passed as the email address. + +2.2.0 (June 20, 2024) +--------------------- + +* Email addresses with internationalized local parts could, with rare Unicode characters, be returned as valid but actually be invalid in their normalized form (returned in the `normalized` field). In particular, it is possible to get a normalized address with a ";" character, which is not valid and could change the interpretation of the address. Local parts now re-validated after Unicode NFC normalization to ensure that invalid characters cannot be injected into the normalized address and that characters with length-increasing NFC normalizations cannot cause a local part to exceed the maximum length after normalization. Thanks to khanh@calif.io from https://calif.io for reporting the issue. +* The length check for email addresses with internationalized local parts is now also applied to the original address string prior to Unicode NFC normalization, which may be longer and could exceed the maximum email address length, to protect callers who do not use the returned normalized address. +* Improved error message for IDNA domains that are too long or have invalid characters after Unicode normalization. +* A new option to parse `My Name ` strings, i.e. a display name plus an email address in angle brackets, is now available. It is off by default. +* Improvements to Python typing. +* Some additional tests added. + +2.1.2 (June 16, 2024) +--------------------- + +* The domain name length limit is corrected from 255 to 253 IDNA ASCII characters. I misread the RFCs. +* When a domain name has no MX record but does have an A or AAAA record, if none of the IP addresses in the response are globally reachable (i.e. not Private-Use, Loopback, etc.), the response is treated as if there was no A/AAAA response and the email address will fail the deliverability check. +* When a domain name has no MX record but does have an A or AAAA record, the mx field in the object returned by validate_email incorrectly held the IP addresses rather than the domain itself. +* Fixes in tests. + 2.1.1 (February 26, 2024) ------------------------- @@ -59,7 +91,7 @@ Version 1.2.1 (May 1, 2022) * example.com/net/org are removed from the special-use reserved domain names list so that they do not raise exceptions if check_deliverability is off. * Improved README. -Verison 1.2.0 (April 24, 2022) +Version 1.2.0 (April 24, 2022) ------------------------------ * Reject domains with NULL MX records (when deliverability checks diff --git a/Makefile b/Makefile index 7898e4f..57df9da 100644 --- a/Makefile +++ b/Makefile @@ -17,7 +17,7 @@ typing: .PHONY: test test: - PYTHONPATH=.:$PYTHONPATH pytest --cov=email_validator -k "not network" + PYTHONPATH=.:$$PYTHONPATH pytest --cov=email_validator -k "not network" .PHONY: testcov testcov: test diff --git a/README.md b/README.md index 921af3d..0d1f0eb 100644 --- a/README.md +++ b/README.md @@ -7,30 +7,31 @@ Python 3.8+ by [Joshua Tauberer](https://joshdata.me). This library validates that a string is of the form `name@example.com` and optionally checks that the domain name is set up to receive email. This is the sort of validation you would want when you are identifying -users by their email address like on a registration/login form (but not -necessarily for composing an email message, see below). +users by their email address like on a registration form. Key features: * Checks that an email address has the correct syntax --- great for - email-based registration/login forms or validing data. + email-based registration/login forms or validating data. * Gives friendly English error messages when validation fails that you can display to end-users. * Checks deliverability (optional): Does the domain name resolve? (You can override the default DNS resolver to add query caching.) -* Supports internationalized domain names and internationalized local parts. -* Rejects addresses with unsafe Unicode characters, obsolete email address - syntax that you'd find unexpected, special use domain names like - `@localhost`, and domains without a dot by default. This is an - opinionated library! +* Supports internationalized domain names (like `@ツ.life`), + internationalized local parts (like `ツ@example.com`), + and optionally parses display names (e.g. `"My Name" `). +* Rejects addresses with invalid or unsafe Unicode characters, + obsolete email address syntax that you'd find unexpected, + special use domain names like `@localhost`, + and domains without a dot by default. + This is an opinionated library! * Normalizes email addresses (important for internationalized and quoted-string addresses! see below). * Python type annotations are used. This is an opinionated library. You should definitely also consider using -the less-opinionated [pyIsEmail](https://github.com/michaelherold/pyIsEmail) and -[flanker](https://github.com/mailgun/flanker) if they are better for your -use case. +the less-opinionated [pyIsEmail](https://github.com/michaelherold/pyIsEmail) +if it works better for you. [![Build Status](https://github.com/JoshData/python-email-validator/actions/workflows/test_and_build.yaml/badge.svg)](https://github.com/JoshData/python-email-validator/actions/workflows/test_and_build.yaml) @@ -142,11 +143,14 @@ The `validate_email` function also accepts the following keyword arguments `allow_quoted_local=False`: Set to `True` to allow obscure and potentially problematic email addresses in which the part of the address before the @-sign contains spaces, @-signs, or other surprising characters when the local part is surrounded in quotes (so-called quoted-string local parts). In the object returned by `validate_email`, the normalized local part removes any unnecessary backslash-escaping and even removes the surrounding quotes if the address would be valid without them. You can also set `email_validator.ALLOW_QUOTED_LOCAL` to `True` to turn this on for all calls by default. -`allow_domain_literal=False`: Set to `True` to allow bracketed IPv4 and "IPv6:"-prefixd IPv6 addresses in the domain part of the email address. No deliverability checks are performed for these addresses. In the object returned by `validate_email`, the normalized domain will use the condensed IPv6 format, if applicable. The object's `domain_address` attribute will hold the parsed `ipaddress.IPv4Address` or `ipaddress.IPv6Address` object if applicable. You can also set `email_validator.ALLOW_DOMAIN_LITERAL` to `True` to turn this on for all calls by default. +`allow_domain_literal=False`: Set to `True` to allow bracketed IPv4 and "IPv6:"-prefixed IPv6 addresses in the domain part of the email address. No deliverability checks are performed for these addresses. In the object returned by `validate_email`, the normalized domain will use the condensed IPv6 format, if applicable. The object's `domain_address` attribute will hold the parsed `ipaddress.IPv4Address` or `ipaddress.IPv6Address` object if applicable. You can also set `email_validator.ALLOW_DOMAIN_LITERAL` to `True` to turn this on for all calls by default. + +`allow_display_name=False`: Set to `True` to allow a display name and bracketed address in the input string, like `My Name `. It's implemented in the spirit but not the letter of RFC 5322 3.4, so it may be stricter or more relaxed than what you want. The display name, if present, is provided in the returned object's `display_name` field after being unquoted and unescaped. You can also set `email_validator.ALLOW_DISPLAY_NAME` to `True` to turn this on for all calls by default. `allow_empty_local=False`: Set to `True` to allow an empty local part (i.e. `@example.com`), e.g. for validating Postfix aliases. +`strict=False`: Set to `True` to perform additional syntax checks (currently only a local part length check). This should be used by mail service providers at address creation to ensure email addresses meet broad compatibility requirements. ### DNS timeout and cache @@ -182,8 +186,12 @@ Internationalized email addresses The email protocol SMTP and the domain name system DNS have historically only allowed English (ASCII) characters in email addresses and domain names, respectively. Each has adapted to internationalization in a separate -way, creating two separate aspects to email address -internationalization. +way, creating two separate aspects to email address internationalization. + +(If your mail submission library doesn't support Unicode at all, then +immediately prior to mail submission you must replace the email address with +its ASCII-ized form. This library gives you back the ASCII-ized form in the +`ascii_email` field in the returned object.) ### Internationalized domain names (IDN) @@ -206,6 +214,19 @@ email addresses, only English letters, numbers, and some punctuation (`._!#$%&'^``*+-=~/?{|}`) are allowed. In internationalized email address local parts, a wider range of Unicode characters are allowed. +Email addresses with these non-ASCII characters require that your mail +submission library and all the mail servers along the route to the destination, +including your own outbound mail server, all support the +[SMTPUTF8 (RFC 6531)](https://tools.ietf.org/html/rfc6531) extension. +Support for SMTPUTF8 varies. If you know ahead of time that SMTPUTF8 is not +supported by your mail submission stack, then you must filter out addresses that +require SMTPUTF8 using the `allow_smtputf8=False` keyword argument (see above). +This will cause the validation function to raise a `EmailSyntaxError` if +delivery would require SMTPUTF8. If you do not set `allow_smtputf8=False`, +you can also check the value of the `smtputf8` field in the returned object. + +### Unsafe Unicode characters are rejected + A surprisingly large number of Unicode characters are not safe to display, especially when the email address is concatenated with other text, so this library tries to protect you by not permitting reserved, non-, private use, @@ -216,48 +237,10 @@ cannot combine with something outside of the email address string or with the @-sign). See https://qntm.org/safe and https://trojansource.codes/ for relevant prior work. (Other than whitespace, these are checks that you should be applying to nearly all user inputs in a security-sensitive -context.) - -These character checks are performed after Unicode normalization (see below), -so you are only fully protected if you replace all user-provided email addresses -with the normalized email address string returned by this library. This does not -guard against the well known problem that many Unicode characters look alike -(or are identical), which can be used to fool humans reading displayed text. - -Email addresses with these non-ASCII characters require that your mail -submission library and the mail servers along the route to the destination, -including your own outbound mail server, all support the -[SMTPUTF8 (RFC 6531)](https://tools.ietf.org/html/rfc6531) extension. -Support for SMTPUTF8 varies. See the `allow_smtputf8` parameter. - -### If you know ahead of time that SMTPUTF8 is not supported by your mail submission stack +context.) This does not guard against the well known problem that many +Unicode characters look alike, which can be used to fool humans reading +displayed text. -By default all internationalized forms are accepted by the validator. -But if you know ahead of time that SMTPUTF8 is not supported by your -mail submission stack, then you must filter out addresses that require -SMTPUTF8 using the `allow_smtputf8=False` keyword argument (see above). -This will cause the validation function to raise a `EmailSyntaxError` if -delivery would require SMTPUTF8. That's just in those cases where -non-ASCII characters appear before the @-sign. If you do not set -`allow_smtputf8=False`, you can also check the value of the `smtputf8` -field in the returned object. - -If your mail submission library doesn't support Unicode at all --- even -in the domain part of the address --- then immediately prior to mail -submission you must replace the email address with its ASCII-ized form. -This library gives you back the ASCII-ized form in the `ascii_email` -field in the returned object, which you can get like this: - -```python -emailinfo = validate_email(email, allow_smtputf8=False) -email = emailinfo.ascii_email -``` - -The local part is left alone (if it has internationalized characters -`allow_smtputf8=False` will force validation to fail) and the domain -part is converted to [IDNA ASCII](https://tools.ietf.org/html/rfc5891). -(You probably should not do this at account creation time so you don't -change the user's login information without telling them.) Normalization ------------- @@ -272,7 +255,7 @@ address. For example, the CJK fullwidth Latin letters are considered semantically equivalent in domain names to their ASCII counterparts. This library -normalizes them to their ASCII counterparts: +normalizes them to their ASCII counterparts (as required by IDNA): ```python emailinfo = validate_email("me@Domain.com") @@ -285,9 +268,7 @@ Because an end-user might type their email address in different (but equivalent) un-normalized forms at different times, you ought to replace what they enter with the normalized form immediately prior to going into your database (during account creation), querying your database -(during login), or sending outbound mail. Normalization may also change -the length of an email address, and this may affect whether it is valid -and acceptable by your SMTP provider. +(during login), or sending outbound mail. The normalizations include lowercasing the domain part of the email address (domain names are case-insensitive), [Unicode "NFC" @@ -301,6 +282,11 @@ in the domain part, possibly other [UTS46](http://unicode.org/reports/tr46) mappings on the domain part, and conversion from Punycode to Unicode characters. +Normalization may change the characters in the email address and the +length of the email address, such that a string might be a valid address +before normalization but invalid after, or vice versa. This library only +permits addresses that are valid both before and after normalization. + (See [RFC 6532 (internationalized email) section 3.1](https://tools.ietf.org/html/rfc6532#section-3.1) and [RFC 5895 (IDNA 2008) section 2](http://www.ietf.org/rfc/rfc5895.txt).) @@ -315,13 +301,6 @@ they are unnecessary. For IPv6 domain literals, the IPv6 address is normalized to condensed form. [RFC 2142](https://datatracker.ietf.org/doc/html/rfc2142) also requires lowercase normalization for some specific mailbox names like `postmaster@`. -### Length checks - -This library checks that the length of the email address is not longer than -the maximum length. The check is performed on the normalized form of the -address, which might be different from a string provided by a user. If you -send email to the original string and not the normalized address, the email -might be rejected because the original address could be too long. Examples -------- @@ -395,6 +374,7 @@ are: | `domain` | The canonical internationalized Unicode form of the domain part of the email address. If the returned string contains non-ASCII characters, either the [SMTPUTF8](https://tools.ietf.org/html/rfc6531) feature of your mail relay will be required to transmit the message or else the email address's domain part must be converted to IDNA ASCII first: Use `ascii_domain` field instead. | | `ascii_domain` | The [IDNA](https://tools.ietf.org/html/rfc5891) [Punycode](https://www.rfc-editor.org/rfc/rfc3492.txt)-encoded form of the domain part of the given email address, as it would be transmitted on the wire. | | `domain_address` | If domain literals are allowed and if the email address contains one, an `ipaddress.IPv4Address` or `ipaddress.IPv6Address` object. | +| `display_name` | If no display name was present and angle brackets do not surround the address, this will be `None`; otherwise, it will be set to the display name, or the empty string if there were angle brackets but no display name. If the display name was quoted, it will be unquoted and unescaped. | | `smtputf8` | A boolean indicating that the [SMTPUTF8](https://tools.ietf.org/html/rfc6531) feature of your mail relay will be required to transmit messages to this address because the local part of the address has non-ASCII characters (the local part cannot be IDNA-encoded). If `allow_smtputf8=False` is passed as an argument, this flag will always be false because an exception is raised if it would have been true. | | `mx` | A list of (priority, domain) tuples of MX records specified in the DNS for the domain (see [RFC 5321 section 5](https://tools.ietf.org/html/rfc5321#section-5)). May be `None` if the deliverability check could not be completed because of a temporary issue like a timeout. | | `mx_fallback_type` | `None` if an `MX` record is found. If no MX records are actually specified in DNS and instead are inferred, through an obsolete mechanism, from A or AAAA records, the value is the type of DNS record used instead (`A` or `AAAA`). May be `None` if the deliverability check could not be completed because of a temporary issue like a timeout. | @@ -458,4 +438,4 @@ git push --tags License ------- -This project is free of any copyright restrictions per the [Unlicense](https://unlicense.org/). (Prior to Feb. 4, 2024, the project was made available under the terms of the [CC0 1.0 Universal public domain dedication](http://creativecommons.org/publicdomain/zero/1.0/).) See [LICENSE](LICENSE) and [CONTRIBUTING.md](CONTRIBUTING.md). \ No newline at end of file +This project is free of any copyright restrictions per the [Unlicense](https://unlicense.org/). (Prior to Feb. 4, 2024, the project was made available under the terms of the [CC0 1.0 Universal public domain dedication](http://creativecommons.org/publicdomain/zero/1.0/).) See [LICENSE](LICENSE) and [CONTRIBUTING.md](CONTRIBUTING.md). diff --git a/email_validator/__init__.py b/email_validator/__init__.py index cd1b301..38d0741 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -1,6 +1,8 @@ +from typing import TYPE_CHECKING + # Export the main method, helper methods, and the public data types. -from .exceptions_types import ValidatedEmail, EmailNotValidError, \ - EmailSyntaxError, EmailUndeliverableError +from .exceptions import EmailNotValidError, EmailSyntaxError, EmailUndeliverableError +from .types import ValidatedEmail from .validate_email import validate_email from .version import __version__ @@ -9,12 +11,14 @@ "EmailSyntaxError", "EmailUndeliverableError", "caching_resolver", "__version__"] - -def caching_resolver(*args, **kwargs): - # Lazy load `deliverability` as it is slow to import (due to dns.resolver) +if TYPE_CHECKING: from .deliverability import caching_resolver +else: + def caching_resolver(*args, **kwargs): + # Lazy load `deliverability` as it is slow to import (due to dns.resolver) + from .deliverability import caching_resolver - return caching_resolver(*args, **kwargs) + return caching_resolver(*args, **kwargs) # These global attributes are a part of the library's API and can be @@ -23,8 +27,11 @@ def caching_resolver(*args, **kwargs): # Default values for keyword arguments. ALLOW_SMTPUTF8 = True +ALLOW_EMPTY_LOCAL = False ALLOW_QUOTED_LOCAL = False ALLOW_DOMAIN_LITERAL = False +ALLOW_DISPLAY_NAME = False +STRICT = False GLOBALLY_DELIVERABLE = True CHECK_DELIVERABILITY = True TEST_ENVIRONMENT = False diff --git a/email_validator/__main__.py b/email_validator/__main__.py index a414ff6..107b08b 100644 --- a/email_validator/__main__.py +++ b/email_validator/__main__.py @@ -12,23 +12,25 @@ # When using STDIN, no output will be given for valid email addresses. # # Keyword arguments to validate_email can be set in environment variables -# of the same name but upprcase (see below). +# of the same name but uppercase (see below). import json import os import sys +from typing import Any, Optional -from .validate_email import validate_email +from .validate_email import validate_email, _Resolver from .deliverability import caching_resolver -from .exceptions_types import EmailNotValidError +from .exceptions import EmailNotValidError -def main(dns_resolver=None): +def main(dns_resolver: Optional[_Resolver] = None) -> None: # The dns_resolver argument is for tests. # Set options from environment variables. - options = {} - for varname in ('ALLOW_SMTPUTF8', 'ALLOW_QUOTED_LOCAL', 'ALLOW_DOMAIN_LITERAL', + options: dict[str, Any] = {} + for varname in ('ALLOW_SMTPUTF8', 'ALLOW_EMPTY_LOCAL', 'ALLOW_QUOTED_LOCAL', 'ALLOW_DOMAIN_LITERAL', + 'ALLOW_DISPLAY_NAME', 'GLOBALLY_DELIVERABLE', 'CHECK_DELIVERABILITY', 'TEST_ENVIRONMENT'): if varname in os.environ: options[varname.lower()] = bool(os.environ[varname]) @@ -37,7 +39,7 @@ def main(dns_resolver=None): options[varname.lower()] = float(os.environ[varname]) if len(sys.argv) == 1: - # Validate the email addresses pased line-by-line on STDIN. + # Validate the email addresses passed line-by-line on STDIN. dns_resolver = dns_resolver or caching_resolver() for line in sys.stdin: email = line.strip() diff --git a/email_validator/deliverability.py b/email_validator/deliverability.py index 182331a..dc10809 100644 --- a/email_validator/deliverability.py +++ b/email_validator/deliverability.py @@ -1,22 +1,31 @@ -from typing import Optional, Any, Dict +from typing import Any, Optional, TypedDict -from .exceptions_types import EmailUndeliverableError +import ipaddress + +from .exceptions import EmailUndeliverableError import dns.resolver import dns.exception -def caching_resolver(*, timeout: Optional[int] = None, cache=None, dns_resolver=None): +def caching_resolver(*, timeout: Optional[int] = None, cache: Any = None, dns_resolver: Optional[dns.resolver.Resolver] = None) -> dns.resolver.Resolver: if timeout is None: from . import DEFAULT_TIMEOUT timeout = DEFAULT_TIMEOUT resolver = dns_resolver or dns.resolver.Resolver() - resolver.cache = cache or dns.resolver.LRUCache() # type: ignore - resolver.lifetime = timeout # type: ignore # timeout, in seconds + resolver.cache = cache or dns.resolver.LRUCache() + resolver.lifetime = timeout # timeout, in seconds return resolver -def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Optional[int] = None, dns_resolver=None): +DeliverabilityInfo = TypedDict("DeliverabilityInfo", { + "mx": list[tuple[int, str]], + "mx_fallback_type": Optional[str], + "unknown-deliverability": str, +}, total=False) + + +def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Optional[int] = None, dns_resolver: Optional[dns.resolver.Resolver] = None) -> DeliverabilityInfo: # Check that the domain resolves to an MX record. If there is no MX record, # try an A or AAAA record which is a deprecated fallback for deliverability. # Raises an EmailUndeliverableError on failure. On success, returns a dict @@ -34,7 +43,7 @@ def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Option elif timeout is not None: raise ValueError("It's not valid to pass both timeout and dns_resolver.") - deliverability_info: Dict[str, Any] = {} + deliverability_info: DeliverabilityInfo = {} try: try: @@ -57,10 +66,30 @@ def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Option deliverability_info["mx_fallback_type"] = None except dns.resolver.NoAnswer: - # If there was no MX record, fall back to an A record. (RFC 5321 Section 5) + # If there was no MX record, fall back to an A or AAA record + # (RFC 5321 Section 5). Check A first since it's more common. + + # If the A/AAAA response has no Globally Reachable IP address, + # treat the response as if it were NoAnswer, i.e., the following + # address types are not allowed fallbacks: Private-Use, Loopback, + # Link-Local, and some other obscure ranges. See + # https://www.iana.org/assignments/iana-ipv4-special-registry/iana-ipv4-special-registry.xhtml + # https://www.iana.org/assignments/iana-ipv6-special-registry/iana-ipv6-special-registry.xhtml + # (Issue #134.) + def is_global_addr(address: Any) -> bool: + try: + ipaddr = ipaddress.ip_address(address) + except ValueError: + return False + return ipaddr.is_global + try: response = dns_resolver.resolve(domain, "A") - deliverability_info["mx"] = [(0, str(r)) for r in response] + + if not any(is_global_addr(r.address) for r in response): + raise dns.resolver.NoAnswer # fall back to AAAA + + deliverability_info["mx"] = [(0, domain)] deliverability_info["mx_fallback_type"] = "A" except dns.resolver.NoAnswer: @@ -69,7 +98,11 @@ def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Option # (It's unclear if SMTP servers actually do this.) try: response = dns_resolver.resolve(domain, "AAAA") - deliverability_info["mx"] = [(0, str(r)) for r in response] + + if not any(is_global_addr(r.address) for r in response): + raise dns.resolver.NoAnswer + + deliverability_info["mx"] = [(0, domain)] deliverability_info["mx_fallback_type"] = "AAAA" except dns.resolver.NoAnswer as e: @@ -89,7 +122,6 @@ def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Option for rec in response: value = b"".join(rec.strings) if value.startswith(b"v=spf1 "): - deliverability_info["spf"] = value.decode("ascii", errors='replace') if value == b"v=spf1 -all": raise EmailUndeliverableError(f"The domain name {domain_i18n} does not send email.") except dns.resolver.NoAnswer: diff --git a/email_validator/exceptions.py b/email_validator/exceptions.py new file mode 100644 index 0000000..87ef13c --- /dev/null +++ b/email_validator/exceptions.py @@ -0,0 +1,13 @@ +class EmailNotValidError(ValueError): + """Parent class of all exceptions raised by this module.""" + pass + + +class EmailSyntaxError(EmailNotValidError): + """Exception raised when an email address fails validation because of its form.""" + pass + + +class EmailUndeliverableError(EmailNotValidError): + """Exception raised when an email address fails validation because its domain name does not appear deliverable.""" + pass diff --git a/email_validator/rfc_constants.py b/email_validator/rfc_constants.py index a6b9c59..e93441b 100644 --- a/email_validator/rfc_constants.py +++ b/email_validator/rfc_constants.py @@ -13,7 +13,7 @@ # RFC 3629 section 4, which appear to be the Unicode code points from # U+0080 to U+10FFFF. ATEXT_INTL = ATEXT + "\u0080-\U0010FFFF" -ATEXT_INTL_RE = re.compile('[.' + ATEXT_INTL + ']') # ATEXT_INTL plus dots +ATEXT_INTL_DOT_RE = re.compile('[.' + ATEXT_INTL + ']') # ATEXT_INTL plus dots DOT_ATOM_TEXT_INTL = re.compile('[' + ATEXT_INTL + ']+(?:\\.[' + ATEXT_INTL + r']+)*\Z') # The domain part of the email address, after IDNA (ASCII) encoding, @@ -30,19 +30,30 @@ # Quoted-string local part (RFC 5321 4.1.2, internationalized by RFC 6531 3.3) # The permitted characters in a quoted string are the characters in the range # 32-126, except that quotes and (literal) backslashes can only appear when escaped -# by a backslash. When internationalized, UTF8 strings are also permitted except +# by a backslash. When internationalized, UTF-8 strings are also permitted except # the ASCII characters that are not previously permitted (see above). # QUOTED_LOCAL_PART_ADDR = re.compile(r"^\"((?:[\u0020-\u0021\u0023-\u005B\u005D-\u007E]|\\[\u0020-\u007E])*)\"@(.*)") -QUOTED_LOCAL_PART_ADDR = re.compile(r"^\"((?:[^\"\\]|\\.)*)\"@(.*)") QTEXT_INTL = re.compile(r"[\u0020-\u007E\u0080-\U0010FFFF]") # Length constants + # RFC 3696 + errata 1003 + errata 1690 (https://www.rfc-editor.org/errata_search.php?rfc=3696&eid=1690) -# explains the maximum length of an email address is 254 octets. +# explains the maximum length of an email address is 254 octets based on RFC 5321 4.5.3.1.3. A +# maximum local part length is also given at RFC 5321 4.5.3.1.1. +# +# But RFC 5321 4.5.3.1 says that these (and other) limits are in a sense suggestions, and longer +# local parts have been seen in the wild. Consequntely, the local part length is only checked +# in "strict" mode. Although the email address maximum length is also somewhat of a suggestion, +# I don't like the idea of having no length checks performed, so I'm leaving that to always be +# checked. EMAIL_MAX_LENGTH = 254 LOCAL_PART_MAX_LENGTH = 64 + +# Although RFC 5321 4.5.3.1.2 gives a (suggested, see above) limit of 255 octets, RFC 1035 2.3.4 also +# imposes a length limit (255 octets). But per https://stackoverflow.com/questions/32290167/what-is-the-maximum-length-of-a-dns-name, +# two of those octets are taken up by the optional final dot and null root label. DNS_LABEL_LENGTH_LIMIT = 63 # in "octets", RFC 1035 2.3.1 -DOMAIN_MAX_LENGTH = 255 # in "octets", RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2 +DOMAIN_MAX_LENGTH = 253 # in "octets" as transmitted # RFC 2142 CASE_INSENSITIVE_MAILBOX_NAMES = [ diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 6634ace..4e87937 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -1,53 +1,211 @@ -from .exceptions_types import EmailSyntaxError +from .exceptions import EmailSyntaxError +from .types import ValidatedEmail from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \ - DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \ - DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS, \ - QUOTED_LOCAL_PART_ADDR + DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_DOT_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \ + DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS import re import unicodedata import idna # implements IDNA 2008; Python's codec is only IDNA 2003 import ipaddress -from typing import Optional - +from typing import Optional, TypedDict + + +def split_email(email: str) -> tuple[Optional[str], str, str, bool]: + # Return the display name, unescaped local part, and domain part + # of the address, and whether the local part was quoted. If no + # display name was present and angle brackets do not surround + # the address, display name will be None; otherwise, it will be + # set to the display name or the empty string if there were + # angle brackets but no display name. + + # Typical email addresses have a single @-sign and no quote + # characters, but the awkward "quoted string" local part form + # (RFC 5321 4.1.2) allows @-signs and escaped quotes to appear + # in the local part if the local part is quoted. + + # A `display name ` format is also present in MIME messages + # (RFC 5322 3.4) and this format is also often recognized in + # mail UIs. It's not allowed in SMTP commands or in typical web + # login forms, but parsing it has been requested, so it's done + # here as a convenience. It's implemented in the spirit but not + # the letter of RFC 5322 3.4 because MIME messages allow newlines + # and comments as a part of the CFWS rule, but this is typically + # not allowed in mail UIs (although comment syntax was requested + # once too). + # + # Display names are either basic characters (the same basic characters + # permitted in email addresses, but periods are not allowed and spaces + # are allowed; see RFC 5322 Appendix A.1.2), or or a quoted string with + # the same rules as a quoted local part. (Multiple quoted strings might + # be allowed? Unclear.) Optional space (RFC 5322 3.4 CFWS) and then the + # email address follows in angle brackets. + # + # An initial quote is ambiguous between starting a display name or + # a quoted local part --- fun. + # + # We assume the input string is already stripped of leading and + # trailing CFWS. + + def split_string_at_unquoted_special(text: str, specials: tuple[str, ...]) -> tuple[str, str]: + # Split the string at the first character in specials (an @-sign + # or left angle bracket) that does not occur within quotes and + # is not followed by a Unicode combining character. + # If no special character is found, raise an error. + inside_quote = False + escaped = False + left_part = "" + for i, c in enumerate(text): + # < plus U+0338 (Combining Long Solidus Overlay) normalizes to + # ≮ U+226E (Not Less-Than), and it would be confusing to treat + # the < as the start of "" syntax in that case. Likewise, + # if anything combines with an @ or ", we should probably not + # treat it as a special character. + if unicodedata.normalize("NFC", text[i:])[0] != c: + left_part += c + + elif inside_quote: + left_part += c + if c == '\\' and not escaped: + escaped = True + elif c == '"' and not escaped: + # The only way to exit the quote is an unescaped quote. + inside_quote = False + escaped = False + else: + escaped = False + elif c == '"': + left_part += c + inside_quote = True + elif c in specials: + # When unquoted, stop before a special character. + break + else: + left_part += c + + # No special symbol found. The special symbols always + # include an at-sign, so this always indicates a missing + # at-sign. The other symbol is optional. + if len(left_part) == len(text): + # The full-width at-sign might occur in CJK contexts. + # We can't accept it because we only accept addresess + # that are actually valid. But if this is common we + # may want to consider accepting and normalizing full- + # width characters for the other special symbols (and + # full-width dot is already accepted in internationalized + # domains) with a new option. + # See https://news.ycombinator.com/item?id=42235268. + if "@" in text: + raise EmailSyntaxError("The email address has the \"full-width\" at-sign (@) character instead of a regular at-sign.") + + # Check another near-homoglyph for good measure because + # homoglyphs in place of required characters could be + # very confusing. We may want to consider checking for + # homoglyphs anywhere we look for a special symbol. + if "﹫" in text: + raise EmailSyntaxError('The email address has the "small commercial at" character instead of a regular at-sign.') + + raise EmailSyntaxError("An email address must have an @-sign.") + + # The right part is whatever is left. + right_part = text[len(left_part):] + + return left_part, right_part + + def unquote_quoted_string(text: str) -> tuple[str, bool]: + # Remove surrounding quotes and unescape escaped backslashes + # and quotes. Escapes are parsed liberally. I think only + # backslashes and quotes can be escaped but we'll allow anything + # to be. + quoted = False + escaped = False + value = "" + for i, c in enumerate(text): + if quoted: + if escaped: + value += c + escaped = False + elif c == '\\': + escaped = True + elif c == '"': + if i != len(text) - 1: + raise EmailSyntaxError("Extra character(s) found after close quote: " + + ", ".join(safe_character_display(c) for c in text[i + 1:])) + break + else: + value += c + elif i == 0 and c == '"': + quoted = True + else: + value += c + + return value, quoted + + # Split the string at the first unquoted @-sign or left angle bracket. + left_part, right_part = split_string_at_unquoted_special(email, ("@", "<")) + + # If the right part starts with an angle bracket, + # then the left part is a display name and the rest + # of the right part up to the final right angle bracket + # is the email address, . + if right_part.startswith("<"): + # Remove space between the display name and angle bracket. + left_part = left_part.rstrip() + + # Unquote and unescape the display name. + display_name, display_name_quoted = unquote_quoted_string(left_part) + + # Check that only basic characters are present in a + # non-quoted display name. + if not display_name_quoted: + bad_chars = { + safe_character_display(c) + for c in display_name + if (not ATEXT_RE.match(c) and c != ' ') or c == '.' + } + if bad_chars: + raise EmailSyntaxError("The display name contains invalid characters when not quoted: " + ", ".join(sorted(bad_chars)) + ".") -def split_email(email): - # Return the local part and domain part of the address and - # whether the local part was quoted as a three-tuple. + # Check for other unsafe characters. + check_unsafe_chars(display_name, allow_space=True) - # Typical email addresses have a single @-sign, but the - # awkward "quoted string" local part form (RFC 5321 4.1.2) - # allows @-signs (and escaped quotes) to appear in the local - # part if the local part is quoted. If the address is quoted, - # split it at a non-escaped @-sign and unescape the escaping. - if m := QUOTED_LOCAL_PART_ADDR.match(email): - local_part, domain_part = m.groups() + # Check that the right part ends with an angle bracket + # but allow spaces after it, I guess. + if ">" not in right_part: + raise EmailSyntaxError("An open angle bracket at the start of the email address has to be followed by a close angle bracket at the end.") + right_part = right_part.rstrip(" ") + if right_part[-1] != ">": + raise EmailSyntaxError("There can't be anything after the email address.") - # Since backslash-escaping is no longer needed because - # the quotes are removed, remove backslash-escaping - # to return in the normalized form. - local_part = re.sub(r"\\(.)", "\\1", local_part) + # Remove the initial and trailing angle brackets. + addr_spec = right_part[1:].rstrip(">") - return local_part, domain_part, True + # Split the email address at the first unquoted @-sign. + local_part, domain_part = split_string_at_unquoted_special(addr_spec, ("@",)) + # Otherwise there is no display name. The left part is the local + # part and the right part is the domain. else: - # Split at the one and only at-sign. - parts = email.split('@') - if len(parts) != 2: - raise EmailSyntaxError("The email address is not valid. It must have exactly one @-sign.") - local_part, domain_part = parts - return local_part, domain_part, False + display_name = None + local_part, domain_part = left_part, right_part + + if domain_part.startswith("@"): + domain_part = domain_part[1:] + # Unquote the local part if it is quoted. + local_part, is_quoted_local_part = unquote_quoted_string(local_part) -def get_length_reason(addr, utf8=False, limit=EMAIL_MAX_LENGTH): + return display_name, local_part, domain_part, is_quoted_local_part + + +def get_length_reason(addr: str, limit: int) -> str: """Helper function to return an error message related to invalid length.""" diff = len(addr) - limit - prefix = "at least " if utf8 else "" suffix = "s" if diff > 1 else "" - return f"({prefix}{diff} character{suffix} too many)" + return f"({diff} character{suffix} too many)" -def safe_character_display(c): +def safe_character_display(c: str) -> str: # Return safely displayable characters in quotes. if c == '\\': return f"\"{c}\"" # can't use repr because it escapes it @@ -64,8 +222,14 @@ def safe_character_display(c): return unicodedata.name(c, h) +class LocalPartValidationResult(TypedDict): + local_part: str + ascii_local_part: Optional[str] + smtputf8: bool + + def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_empty_local: bool = False, - quoted_local_part: bool = False): + quoted_local_part: bool = False, strict: bool = False) -> LocalPartValidationResult: """Validates the syntax of the local part of an email address.""" if len(local) == 0: @@ -87,7 +251,7 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp # internationalized, then the UTF-8 encoding may be longer, but # that may not be relevant. We will check the total address length # instead. - if len(local) > LOCAL_PART_MAX_LENGTH: + if strict and len(local) > LOCAL_PART_MAX_LENGTH: reason = get_length_reason(local, limit=LOCAL_PART_MAX_LENGTH) raise EmailSyntaxError(f"The email address is too long before the @-sign {reason}.") @@ -138,8 +302,8 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp valid = "dot-atom" requires_smtputf8 = True - # There are no syntactic restrictions on quoted local parts, so if - # it was originally quoted, it is probably valid. More characters + # There are no dot-atom syntax restrictions on quoted local parts, so + # if it was originally quoted, it is probably valid. More characters # are allowed, like @-signs, spaces, and quotes, and there are no # restrictions on the placement of dots, as in dot-atom local parts. elif quoted_local_part: @@ -172,12 +336,8 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp valid = "quoted" # If the local part matches the internationalized dot-atom form or was quoted, - # perform normalization and additional checks for Unicode strings. + # perform additional checks for Unicode strings. if valid: - # RFC 6532 section 3.1 says that Unicode NFC normalization should be applied, - # so we'll return the normalized local part in the return value. - local = unicodedata.normalize("NFC", local) - # Check that the local part is a valid, safe, and sensible Unicode string. # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked # by DOT_ATOM_TEXT_INTL and QTEXT_INTL. Other characters may be permitted by the @@ -215,7 +375,7 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp bad_chars = { safe_character_display(c) for c in local - if not ATEXT_INTL_RE.match(c) + if not ATEXT_INTL_DOT_RE.match(c) } if bad_chars: raise EmailSyntaxError("The email address contains invalid characters before the @-sign: " + ", ".join(sorted(bad_chars)) + ".") @@ -229,7 +389,7 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp raise EmailSyntaxError("The email address contains invalid characters before the @-sign.") -def check_unsafe_chars(s, allow_space=False): +def check_unsafe_chars(s: str, allow_space: bool = False) -> None: # Check for unsafe characters or characters that would make the string # invalid or non-sensible Unicode. bad_chars = set() @@ -242,7 +402,7 @@ def check_unsafe_chars(s, allow_space=False): # Combining character in first position would combine with something # outside of the email address if concatenated, so they are not safe. # We also check if this occurs after the @-sign, which would not be - # sensible. + # sensible because it would modify the @-sign. if i == 0: bad_chars.add(c) elif category == "Zs": @@ -281,7 +441,7 @@ def check_unsafe_chars(s, allow_space=False): + ", ".join(safe_character_display(c) for c in sorted(bad_chars)) + ".") -def check_dot_atom(label, start_descr, end_descr, is_hostname): +def check_dot_atom(label: str, start_descr: str, end_descr: str, is_hostname: bool) -> None: # RFC 5322 3.2.3 if label.endswith("."): raise EmailSyntaxError(end_descr.format("period")) @@ -300,10 +460,45 @@ def check_dot_atom(label, start_descr, end_descr, is_hostname): raise EmailSyntaxError("An email address cannot have a period and a hyphen next to each other.") -def validate_email_domain_name(domain, test_environment=False, globally_deliverable=True): +def uts46_valid_char(char: str) -> bool: + # By exhaustively searching for characters rejected by + # for c in (chr(i) for i in range(0x110000)): + # idna.uts46_remap(c, std3_rules=False, transitional=False) + # I found the following rules are pretty close. + c = ord(char) + if 0x80 <= c <= 0x9f: + # 8-bit ASCII range. + return False + elif ((0x2010 <= c <= 0x2060 and not (0x2024 <= c <= 0x2026) and not (0x2028 <= c <= 0x202E)) + or c in (0x00AD, 0x2064, 0xFF0E) + or 0x200B <= c <= 0x200D + or 0x1BCA0 <= c <= 0x1BCA3): + # Characters that are permitted but fall into one of the + # tests below. + return True + elif unicodedata.category(chr(c)) in ("Cf", "Cn", "Co", "Cs", "Zs", "Zl", "Zp"): + # There are a bunch of Zs characters including regular space + # that are allowed by UTS46 but are not allowed in domain + # names anyway. + # + # There are some Cn (unassigned) characters that the idna + # package doesn't reject but we can, I think. + return False + elif "002E" in unicodedata.decomposition(chr(c)).split(" "): + # Characters that decompose into a sequence with a dot. + return False + return True + + +class DomainNameValidationResult(TypedDict): + ascii_domain: str + domain: str + + +def validate_email_domain_name(domain: str, test_environment: bool = False, globally_deliverable: bool = True) -> DomainNameValidationResult: """Validates the syntax of the domain part of an email address.""" - # Check for invalid characters before normalization. + # Check for invalid characters. # (RFC 952 plus RFC 6531 section 3.3 for internationalized addresses) bad_chars = { safe_character_display(c) @@ -319,18 +514,39 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera # they may not be valid, safe, or sensible Unicode strings. check_unsafe_chars(domain) + # Reject characters that would be rejected by UTS-46 normalization next but + # with an error message under our control. + bad_chars = { + safe_character_display(c) for c in domain + if not uts46_valid_char(c) + } + if bad_chars: + raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".") + # Perform UTS-46 normalization, which includes casefolding, NFC normalization, # and converting all label separators (the period/full stop, fullwidth full stop, # ideographic full stop, and halfwidth ideographic full stop) to regular dots. # It will also raise an exception if there is an invalid character in the input, - # such as "⒈" which is invalid because it would expand to include a dot. - # Since several characters are normalized to a dot, this has to come before + # such as "⒈" which is invalid because it would expand to include a dot and + # U+1FEF which normalizes to a backtick, which is not an allowed hostname character. + # Since several characters *are* normalized to a dot, this has to come before # checks related to dots, like check_dot_atom which comes next. + original_domain = domain try: domain = idna.uts46_remap(domain, std3_rules=False, transitional=False) except idna.IDNAError as e: raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).") from e + # Check for invalid characters after Unicode normalization which are not caught + # by uts46_remap (see tests for examples). + bad_chars = { + safe_character_display(c) + for c in domain + if not ATEXT_HOSTNAME_INTL.match(c) + } + if bad_chars: + raise EmailSyntaxError("The part after the @-sign contains invalid characters after Unicode normalization: " + ", ".join(sorted(bad_chars)) + ".") + # The domain part is made up dot-separated "labels." Each label must # have at least one character and cannot start or end with dashes, which # means there are some surprising restrictions on periods and dashes. @@ -355,29 +571,22 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera # the MTA must either support SMTPUTF8 or the mail client must convert the # domain name to IDNA before submission. # - # Unfortunately this step incorrectly 'fixes' domain names with leading - # periods by removing them, so we have to check for this above. It also gives - # a funky error message ("No input") when there are two periods in a - # row, also checked separately above. - # # For ASCII-only domains, the transformation does nothing and is safe to # apply. However, to ensure we don't rely on the idna library for basic # syntax checks, we don't use it if it's not needed. # - # uts46 is off here because it is handled above. + # idna.encode also checks the domain name length after encoding but it + # doesn't give a nice error, so we call the underlying idna.alabel method + # directly. idna.alabel checks label length and doesn't give great messages, + # but we can't easily go to lower level methods. try: - ascii_domain = idna.encode(domain, uts46=False).decode("ascii") + ascii_domain = ".".join( + idna.alabel(label).decode("ascii") + for label in domain.split(".") + ) except idna.IDNAError as e: - if "Domain too long" in str(e): - # We can't really be more specific because UTS-46 normalization means - # the length check is applied to a string that is different from the - # one the user supplied. Also I'm not sure if the length check applies - # to the internationalized form, the IDNA ASCII form, or even both! - raise EmailSyntaxError("The email address is too long after the @-sign.") from e - - # Other errors seem to not be possible because the call to idna.uts46_remap - # would have already raised them. - raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).") from e + # Some errors would have already been raised by idna.uts46_remap. + raise EmailSyntaxError(f"The part after the @-sign is invalid ({e}).") from e # Check the syntax of the string returned by idna.encode. # It should never fail. @@ -392,8 +601,13 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera # as IDNA ASCII. (This is also checked by idna.encode, so this exception # is never reached for internationalized domains.) if len(ascii_domain) > DOMAIN_MAX_LENGTH: - reason = get_length_reason(ascii_domain, limit=DOMAIN_MAX_LENGTH) - raise EmailSyntaxError(f"The email address is too long after the @-sign {reason}.") + if ascii_domain == original_domain: + reason = get_length_reason(ascii_domain, limit=DOMAIN_MAX_LENGTH) + raise EmailSyntaxError(f"The email address is too long after the @-sign {reason}.") + else: + diff = len(ascii_domain) - DOMAIN_MAX_LENGTH + s = "" if diff == 1 else "s" + raise EmailSyntaxError(f"The email address is too long after the @-sign ({diff} byte{s} too many after IDNA encoding).") # Also check the label length limit. # (RFC 1035 2.3.1) @@ -435,22 +649,38 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera # but not be actual IDNA. For ASCII-only domains, the conversion out # of IDNA just gives the same thing back. # - # This gives us the canonical internationalized form of the domain. + # This gives us the canonical internationalized form of the domain, + # which we return to the caller as a part of the normalized email + # address. try: domain_i18n = idna.decode(ascii_domain.encode('ascii')) except idna.IDNAError as e: raise EmailSyntaxError(f"The part after the @-sign is not valid IDNA ({e}).") from e - # Check for invalid characters after normalization. These - # should never arise. See the similar checks above. + # Check that this normalized domain name has not somehow become + # an invalid domain name. All of the checks before this point + # using the idna package probably guarantee that we now have + # a valid international domain name in most respects. But it + # doesn't hurt to re-apply some tests to be sure. See the similar + # tests above. + + # Check for invalid and unsafe characters. We have no test + # case for this. bad_chars = { safe_character_display(c) - for c in domain + for c in domain_i18n if not ATEXT_HOSTNAME_INTL.match(c) } if bad_chars: raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".") - check_unsafe_chars(domain) + check_unsafe_chars(domain_i18n) + + # Check that it can be encoded back to IDNA ASCII. We have no test + # case for this. + try: + idna.encode(domain_i18n) + except idna.IDNAError as e: + raise EmailSyntaxError(f"The part after the @-sign became invalid after normalizing to international characters ({e}).") from e # Return the IDNA ASCII-encoded form of the domain, which is how it # would be transmitted on the wire (except when used with SMTPUTF8 @@ -464,52 +694,81 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera } -def validate_email_length(addrinfo): - # If the email address has an ASCII representation, then we assume it may be - # transmitted in ASCII (we can't assume SMTPUTF8 will be used on all hops to - # the destination) and the length limit applies to ASCII characters (which is - # the same as octets). The number of characters in the internationalized form - # may be many fewer (because IDNA ASCII is verbose) and could be less than 254 - # Unicode characters, and of course the number of octets over the limit may - # not be the number of characters over the limit, so if the email address is - # internationalized, we can't give any simple information about why the address - # is too long. - if addrinfo.ascii_email and len(addrinfo.ascii_email) > EMAIL_MAX_LENGTH: - if addrinfo.ascii_email == addrinfo.normalized: - reason = get_length_reason(addrinfo.ascii_email) - elif len(addrinfo.normalized) > EMAIL_MAX_LENGTH: - # If there are more than 254 characters, then the ASCII - # form is definitely going to be too long. - reason = get_length_reason(addrinfo.normalized, utf8=True) - else: - reason = "(when converted to IDNA ASCII)" - raise EmailSyntaxError(f"The email address is too long {reason}.") - - # In addition, check that the UTF-8 encoding (i.e. not IDNA ASCII and not - # Unicode characters) is at most 254 octets. If the addres is transmitted using - # SMTPUTF8, then the length limit probably applies to the UTF-8 encoded octets. - # If the email address has an ASCII form that differs from its internationalized - # form, I don't think the internationalized form can be longer, and so the ASCII - # form length check would be sufficient. If there is no ASCII form, then we have - # to check the UTF-8 encoding. The UTF-8 encoding could be up to about four times - # longer than the number of characters. +def validate_email_length(addrinfo: ValidatedEmail) -> None: + # There are three forms of the email address whose length must be checked: # - # See the length checks on the local part and the domain. - if len(addrinfo.normalized.encode("utf8")) > EMAIL_MAX_LENGTH: - if len(addrinfo.normalized) > EMAIL_MAX_LENGTH: - # If there are more than 254 characters, then the UTF-8 - # encoding is definitely going to be too long. - reason = get_length_reason(addrinfo.normalized, utf8=True) - else: - reason = "(when encoded in bytes)" - raise EmailSyntaxError(f"The email address is too long {reason}.") - - -def validate_email_domain_literal(domain_literal): + # 1) The original email address string. Since callers may continue to use + # this string, even though we recommend using the normalized form, we + # should not pass validation when the original input is not valid. This + # form is checked first because it is the original input. + # 2) The normalized email address. We perform Unicode NFC normalization of + # the local part, we normalize the domain to internationalized characters + # (if originally IDNA ASCII) which also includes Unicode normalization, + # and we may remove quotes in quoted local parts. We recommend that + # callers use this string, so it must be valid. + # 3) The email address with the IDNA ASCII representation of the domain + # name, since this string may be used with email stacks that don't + # support UTF-8. Since this is the least likely to be used by callers, + # it is checked last. Note that ascii_email will only be set if the + # local part is ASCII, but conceivably the caller may combine a + # internationalized local part with an ASCII domain, so we check this + # on that combination also. Since we only return the normalized local + # part, we use that (and not the unnormalized local part). + # + # In all cases, the length is checked in UTF-8 because the SMTPUTF8 + # extension to SMTP validates the length in bytes. + + addresses_to_check = [ + (addrinfo.original, None), + (addrinfo.normalized, "after normalization"), + ((addrinfo.ascii_local_part or addrinfo.local_part or "") + "@" + addrinfo.ascii_domain, "when the part after the @-sign is converted to IDNA ASCII"), + ] + + for addr, reason in addresses_to_check: + addr_len = len(addr) + addr_utf8_len = len(addr.encode("utf8")) + diff = addr_utf8_len - EMAIL_MAX_LENGTH + if diff > 0: + if reason is None and addr_len == addr_utf8_len: + # If there is no normalization or transcoding, + # we can give a simple count of the number of + # characters over the limit. + reason = get_length_reason(addr, limit=EMAIL_MAX_LENGTH) + elif reason is None: + # If there is no normalization but there is + # some transcoding to UTF-8, we can compute + # the minimum number of characters over the + # limit by dividing the number of bytes over + # the limit by the maximum number of bytes + # per character. + mbpc = max(len(c.encode("utf8")) for c in addr) + mchars = max(1, diff // mbpc) + suffix = "s" if diff > 1 else "" + if mchars == diff: + reason = f"({diff} character{suffix} too many)" + else: + reason = f"({mchars}-{diff} character{suffix} too many)" + else: + # Since there is normalization, the number of + # characters in the input that need to change is + # impossible to know. + suffix = "s" if diff > 1 else "" + reason += f" ({diff} byte{suffix} too many)" + raise EmailSyntaxError(f"The email address is too long {reason}.") + + +class DomainLiteralValidationResult(TypedDict): + domain_address: ipaddress.IPv4Address | ipaddress.IPv6Address + domain: str + + +def validate_email_domain_literal(domain_literal: str) -> DomainLiteralValidationResult: # This is obscure domain-literal syntax. Parse it and return # a compressed/normalized address. # RFC 5321 4.1.3 and RFC 5322 3.4.1. + addr: ipaddress.IPv4Address | ipaddress.IPv6Address + # Try to parse the domain literal as an IPv4 address. # There is no tag for IPv4 addresses, so we can never # be sure if the user intends an IPv4 address. diff --git a/email_validator/exceptions_types.py b/email_validator/types.py similarity index 84% rename from email_validator/exceptions_types.py rename to email_validator/types.py index 4522b4f..42cf174 100644 --- a/email_validator/exceptions_types.py +++ b/email_validator/types.py @@ -1,20 +1,5 @@ import warnings -from typing import Optional - - -class EmailNotValidError(ValueError): - """Parent class of all exceptions raised by this module.""" - pass - - -class EmailSyntaxError(EmailNotValidError): - """Exception raised when an email address fails validation because of its form.""" - pass - - -class EmailUndeliverableError(EmailNotValidError): - """Exception raised when an email address fails validation because its domain name does not appear deliverable.""" - pass +from typing import Any, Optional class ValidatedEmail: @@ -24,7 +9,7 @@ class ValidatedEmail: """The email address that was passed to validate_email. (If passed as bytes, this will be a string.)""" original: str - """The normalized email address, which should always be used in preferance to the original address. + """The normalized email address, which should always be used in preference to the original address. The normalized address converts an IDNA ASCII domain name to Unicode, if possible, and performs Unicode normalization on the local part and on the domain (if originally Unicode). It is the concatenation of the local_part and domain attributes, separated by an @-sign.""" @@ -56,22 +41,20 @@ class ValidatedEmail: """If a deliverability check is performed and if it succeeds, a list of (priority, domain) tuples of MX records specified in the DNS for the domain.""" - mx: list + mx: list[tuple[int, str]] """If no MX records are actually specified in DNS and instead are inferred, through an obsolete mechanism, from A or AAAA records, the value is the type of DNS record used instead (`A` or `AAAA`).""" - mx_fallback_type: str + mx_fallback_type: Optional[str] - """Tests use this constructor.""" - def __init__(self, **kwargs): - for k, v in kwargs.items(): - setattr(self, k, v) + """The display name in the original input text, unquoted and unescaped, or None.""" + display_name: Optional[str] - def __repr__(self): + def __repr__(self) -> str: return f"" """For backwards compatibility, support old field names.""" - def __getattr__(self, key): + def __getattr__(self, key: str) -> str: if key == "original_email": return self.original if key == "email": @@ -79,13 +62,13 @@ def __getattr__(self, key): raise AttributeError(key) @property - def email(self): + def email(self) -> str: warnings.warn("ValidatedEmail.email is deprecated and will be removed, use ValidatedEmail.normalized instead", DeprecationWarning) return self.normalized """For backwards compatibility, some fields are also exposed through a dict-like interface. Note that some of the names changed when they became attributes.""" - def __getitem__(self, key): + def __getitem__(self, key: str) -> Optional[str] | bool | list[tuple[int, str]]: warnings.warn("dict-like access to the return value of validate_email is deprecated and may not be supported in the future.", DeprecationWarning, stacklevel=2) if key == "email": return self.normalized @@ -106,7 +89,7 @@ def __getitem__(self, key): raise KeyError() """Tests use this.""" - def __eq__(self, other): + def __eq__(self, other: object) -> bool: if not isinstance(other, ValidatedEmail): return False return ( @@ -120,21 +103,23 @@ def __eq__(self, other): and repr(sorted(self.mx) if getattr(self, 'mx', None) else None) == repr(sorted(other.mx) if getattr(other, 'mx', None) else None) and getattr(self, 'mx_fallback_type', None) == getattr(other, 'mx_fallback_type', None) + and getattr(self, 'display_name', None) == getattr(other, 'display_name', None) ) """This helps producing the README.""" - def as_constructor(self): + def as_constructor(self) -> str: return "ValidatedEmail(" \ + ",".join(f"\n {key}={repr(getattr(self, key))}" for key in ('normalized', 'local_part', 'domain', 'ascii_email', 'ascii_local_part', 'ascii_domain', - 'smtputf8', 'mx', 'mx_fallback_type') + 'smtputf8', 'mx', 'mx_fallback_type', + 'display_name') if hasattr(self, key) ) \ + ")" """Convenience method for accessing ValidatedEmail as a dict""" - def as_dict(self): + def as_dict(self) -> dict[str, Any]: d = self.__dict__ if d.get('domain_address'): d['domain_address'] = repr(d['domain_address']) diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index d6051a9..5837ed2 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -1,23 +1,33 @@ -from typing import Optional, Union +from typing import Optional, TYPE_CHECKING +import unicodedata -from .exceptions_types import EmailSyntaxError, ValidatedEmail +from .exceptions import EmailSyntaxError +from .types import ValidatedEmail from .syntax import split_email, validate_email_local_part, validate_email_domain_name, validate_email_domain_literal, validate_email_length from .rfc_constants import CASE_INSENSITIVE_MAILBOX_NAMES +if TYPE_CHECKING: + import dns.resolver + _Resolver = dns.resolver.Resolver +else: + _Resolver = object + def validate_email( - email: Union[str, bytes], + email: str | bytes, /, # prior arguments are positional-only *, # subsequent arguments are keyword-only allow_smtputf8: Optional[bool] = None, - allow_empty_local: bool = False, + allow_empty_local: Optional[bool] = None, allow_quoted_local: Optional[bool] = None, allow_domain_literal: Optional[bool] = None, + allow_display_name: Optional[bool] = None, + strict: Optional[bool] = None, check_deliverability: Optional[bool] = None, test_environment: Optional[bool] = None, globally_deliverable: Optional[bool] = None, timeout: Optional[int] = None, - dns_resolver: Optional[object] = None + dns_resolver: Optional[_Resolver] = None ) -> ValidatedEmail: """ Given an email address, and some options, returns a ValidatedEmail instance @@ -26,14 +36,20 @@ def validate_email( """ # Fill in default values of arguments. - from . import ALLOW_SMTPUTF8, ALLOW_QUOTED_LOCAL, ALLOW_DOMAIN_LITERAL, \ - GLOBALLY_DELIVERABLE, CHECK_DELIVERABILITY, TEST_ENVIRONMENT, DEFAULT_TIMEOUT + from . import ALLOW_SMTPUTF8, ALLOW_EMPTY_LOCAL, ALLOW_QUOTED_LOCAL, ALLOW_DOMAIN_LITERAL, ALLOW_DISPLAY_NAME, \ + STRICT, GLOBALLY_DELIVERABLE, CHECK_DELIVERABILITY, TEST_ENVIRONMENT, DEFAULT_TIMEOUT if allow_smtputf8 is None: allow_smtputf8 = ALLOW_SMTPUTF8 + if allow_empty_local is None: + allow_empty_local = ALLOW_EMPTY_LOCAL if allow_quoted_local is None: allow_quoted_local = ALLOW_QUOTED_LOCAL if allow_domain_literal is None: allow_domain_literal = ALLOW_DOMAIN_LITERAL + if allow_display_name is None: + allow_display_name = ALLOW_DISPLAY_NAME + if strict is None: + strict = STRICT if check_deliverability is None: check_deliverability = CHECK_DELIVERABILITY if test_environment is None: @@ -43,26 +59,41 @@ def validate_email( if timeout is None and dns_resolver is None: timeout = DEFAULT_TIMEOUT - # Allow email to be a str or bytes instance. If bytes, - # it must be ASCII because that's how the bytes work - # on the wire with SMTP. - if not isinstance(email, str): + if isinstance(email, str): + pass + elif isinstance(email, bytes): + # Allow email to be a bytes instance as if it is what + # will be transmitted on the wire. But assume SMTPUTF8 + # is unavailable, so it must be ASCII. try: email = email.decode("ascii") except ValueError as e: raise EmailSyntaxError("The email address is not valid ASCII.") from e + else: + raise TypeError("email must be str or bytes") - # Split the address into the local part (before the @-sign) - # and the domain part (after the @-sign). Normally, there - # is only one @-sign. But the awkward "quoted string" local - # part form (RFC 5321 4.1.2) allows @-signs in the local + # Split the address into the display name (or None), the local part + # (before the @-sign), and the domain part (after the @-sign). + # Normally, there is only one @-sign. But the awkward "quoted string" + # local part form (RFC 5321 4.1.2) allows @-signs in the local # part if the local part is quoted. - local_part, domain_part, is_quoted_local_part \ + display_name, local_part, domain_part, is_quoted_local_part \ = split_email(email) + if display_name: + # UTS #39 3.3 Email Security Profiles for Identifiers requires + # display names (incorrectly called "quoted-string-part" there) + # to be NFC normalized. Since these are not a part of what we + # are really validating, we won't check that the input was NFC + # normalized, but we'll normalize in output. + display_name = unicodedata.normalize("NFC", display_name) + # Collect return values in this instance. ret = ValidatedEmail() - ret.original = email + ret.original = ((local_part if not is_quoted_local_part + else ('"' + local_part + '"')) + + "@" + domain_part) # drop the display name, if any, for email length tests at the end + ret.display_name = display_name # Validate the email address's local part syntax and get a normalized form. # If the original address was quoted and the decoded local part is a valid @@ -71,11 +102,36 @@ def validate_email( local_part_info = validate_email_local_part(local_part, allow_smtputf8=allow_smtputf8, allow_empty_local=allow_empty_local, - quoted_local_part=is_quoted_local_part) + quoted_local_part=is_quoted_local_part, + strict=strict) ret.local_part = local_part_info["local_part"] ret.ascii_local_part = local_part_info["ascii_local_part"] ret.smtputf8 = local_part_info["smtputf8"] + # RFC 6532 section 3.1 says that Unicode NFC normalization should be applied, + # so we'll return the NFC-normalized local part. Since the caller may use that + # string in place of the original string, ensure it is also valid. + # + # UTS #39 3.3 Email Security Profiles for Identifiers requires local parts + # to be NFKC normalized, which loses some information in characters that can + # be decomposed. We might want to consider applying NFKC normalization, but + # we can't make the change easily because it would break database lookups + # for any caller that put a normalized address from a previous version of + # this library. (UTS #39 seems to require that the *input* be NKFC normalized + # and has other requirements that are hard to check without additional Unicode + # data, and I don't know whether the rules really apply in the wild.) + normalized_local_part = unicodedata.normalize("NFC", ret.local_part) + if normalized_local_part != ret.local_part: + try: + validate_email_local_part(normalized_local_part, + allow_smtputf8=allow_smtputf8, + allow_empty_local=allow_empty_local, + quoted_local_part=is_quoted_local_part, + strict=strict) + except EmailSyntaxError as e: + raise EmailSyntaxError("After Unicode normalization: " + str(e)) from e + ret.local_part = normalized_local_part + # If a quoted local part isn't allowed but is present, now raise an exception. # This is done after any exceptions raised by validate_email_local_part so # that mandatory checks have highest precedence. @@ -98,20 +154,20 @@ def validate_email( elif domain_part.startswith("[") and domain_part.endswith("]"): # Parse the address in the domain literal and get back a normalized domain. - domain_part_info = validate_email_domain_literal(domain_part[1:-1]) + domain_literal_info = validate_email_domain_literal(domain_part[1:-1]) if not allow_domain_literal: raise EmailSyntaxError("A bracketed IP address after the @-sign is not allowed here.") - ret.domain = domain_part_info["domain"] - ret.ascii_domain = domain_part_info["domain"] # Domain literals are always ASCII. - ret.domain_address = domain_part_info["domain_address"] + ret.domain = domain_literal_info["domain"] + ret.ascii_domain = domain_literal_info["domain"] # Domain literals are always ASCII. + ret.domain_address = domain_literal_info["domain_address"] is_domain_literal = True # Prevent deliverability checks. else: # Check the syntax of the domain and get back a normalized # internationalized and ASCII form. - domain_part_info = validate_email_domain_name(domain_part, test_environment=test_environment, globally_deliverable=globally_deliverable) - ret.domain = domain_part_info["domain"] - ret.ascii_domain = domain_part_info["ascii_domain"] + domain_name_info = validate_email_domain_name(domain_part, test_environment=test_environment, globally_deliverable=globally_deliverable) + ret.domain = domain_name_info["domain"] + ret.ascii_domain = domain_name_info["ascii_domain"] # Construct the complete normalized form. ret.normalized = ret.local_part + "@" + ret.domain @@ -127,6 +183,11 @@ def validate_email( # Check the length of the address. validate_email_length(ret) + # Check that a display name is permitted. It's the last syntax check + # because we always check against optional parsing features last. + if display_name is not None and not allow_display_name: + raise EmailSyntaxError("A display name and angle brackets around the email address are not permitted here.") + if check_deliverability and not test_environment: # Validate the email address's deliverability using DNS # and update the returned ValidatedEmail object with metadata. @@ -140,7 +201,9 @@ def validate_email( deliverability_info = validate_email_deliverability( ret.ascii_domain, ret.domain, timeout, dns_resolver ) - for key, value in deliverability_info.items(): - setattr(ret, key, value) + mx = deliverability_info.get("mx") + if mx is not None: + ret.mx = mx + ret.mx_fallback_type = deliverability_info.get("mx_fallback_type") return ret diff --git a/email_validator/version.py b/email_validator/version.py index 58039f5..55e4709 100644 --- a/email_validator/version.py +++ b/email_validator/version.py @@ -1 +1 @@ -__version__ = "2.1.1" +__version__ = "2.3.0" diff --git a/pyproject.toml b/pyproject.toml index 1379d17..a92c08e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,16 @@ +[tool.mypy] +disallow_any_generics = true +disallow_subclassing_any = true + +check_untyped_defs = true +disallow_incomplete_defs = true +disallow_untyped_calls = true +disallow_untyped_decorators = true +disallow_untyped_defs = true + +warn_redundant_casts = true +warn_unused_ignores = true + [tool.pytest.ini_options] markers = [ "network: marks tests as requiring Internet access", diff --git a/setup.cfg b/setup.cfg index 3387df1..8ceac96 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [metadata] -name = email_validator +name = email-validator version = attr: email_validator.version.__version__ description = A robust email address syntax and deliverability validation library. long_description = file: README.md diff --git a/test_requirements.txt b/test_requirements.txt index d05813d..bea5d5a 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -7,20 +7,20 @@ # the earliest Python version we support, and some exception # messages may depend on package versions, so we pin versions # for reproducible testing.) -coverage==7.4.4 +coverage==7.5.3 dnspython==2.6.1 -exceptiongroup==1.2.0 -flake8==7.0.0 +exceptiongroup==1.2.1 +flake8==7.1.0 idna==3.7 iniconfig==2.0.0 mccabe==0.7.0 -mypy==1.9.0 +mypy==1.10.0 mypy-extensions==1.0.0 -packaging==24.0 -pluggy==1.4.0 -pycodestyle==2.11.1 +packaging==24.1 +pluggy==1.5.0 +pycodestyle==2.12.0 pyflakes==3.2.0 -pytest==8.1.1 +pytest==8.2.2 pytest-cov==5.0.0 tomli==2.0.1 -typing_extensions==4.11.0 +typing_extensions==4.12.2 diff --git a/tests/mocked-dns-answers.json b/tests/mocked-dns-answers.json index 19e443c..8da7db6 100644 --- a/tests/mocked-dns-answers.json +++ b/tests/mocked-dns-answers.json @@ -13,6 +13,35 @@ "5 gmail-smtp-in.l.google.com." ] }, + { + "query": { + "name": "pages.github.com", + "type": "MX", + "class": "IN" + }, + "answer": [] + }, + { + "query": { + "name": "pages.github.com", + "type": "A", + "class": "IN" + }, + "answer": [ + "185.199.108.153", + "185.199.109.153", + "185.199.110.153", + "185.199.111.153" + ] + }, + { + "query": { + "name": "pages.github.com", + "type": "TXT", + "class": "IN" + }, + "answer": [] + }, { "query": { "name": "xkxufoekjvjfjeodlfmdfjcu.com", @@ -31,6 +60,32 @@ "0 ." ] }, + { + "query": { + "name": "g.mail.com", + "type": "MX", + "class": "IN" + }, + "answer": [] + }, + { + "query": { + "name": "g.mail.com", + "type": "A", + "class": "IN" + }, + "answer": [] + }, + { + "query": { + "name": "g.mail.com", + "type": "AAAA", + "class": "IN" + }, + "answer": [ + "::1" + ] + }, { "query": { "name": "nellis.af.mil", @@ -74,7 +129,7 @@ "type": "A", "class": "IN" }, - "answer": [] + "answer": ["127.0.0.1"] }, { "query": { @@ -84,6 +139,30 @@ }, "answer": [] }, + { + "query": { + "name": "ipv6only.joshdata.me", + "type": "MX", + "class": "IN" + }, + "answer": [] + }, + { + "query": { + "name": "ipv6only.joshdata.me", + "type": "A", + "class": "IN" + }, + "answer": [] + }, + { + "query": { + "name": "ipv6only.joshdata.me", + "type": "AAAA", + "class": "IN" + }, + "answer": ["::1"] + }, { "query": { "name": "mail.example", diff --git a/tests/mocked_dns_response.py b/tests/mocked_dns_response.py index 1c7d157..43508a3 100644 --- a/tests/mocked_dns_response.py +++ b/tests/mocked_dns_response.py @@ -1,3 +1,7 @@ +from typing import Any, Iterator, Optional + +import dns.exception +import dns.rdataset import dns.resolver import json import os.path @@ -20,9 +24,11 @@ class MockedDnsResponseData: DATA_PATH = os.path.dirname(__file__) + "/mocked-dns-answers.json" + INSTANCE = None + @staticmethod - def create_resolver(): - if not hasattr(MockedDnsResponseData, 'INSTANCE'): + def create_resolver() -> dns.resolver.Resolver: + if MockedDnsResponseData.INSTANCE is None: # Create a singleton instance of this class and load the saved DNS responses. # Except when BUILD_MOCKED_DNS_RESPONSE_DATA is true, don't load the data. singleton = MockedDnsResponseData() @@ -35,20 +41,19 @@ def create_resolver(): dns_resolver = dns.resolver.Resolver(configure=BUILD_MOCKED_DNS_RESPONSE_DATA) return caching_resolver(cache=MockedDnsResponseData.INSTANCE, dns_resolver=dns_resolver) - def __init__(self): - self.data = {} + def __init__(self) -> None: + self.data: dict[dns.resolver.CacheKey, Optional[MockedDnsResponseData.Ans]] = {} - def load(self): - # Loads the saved DNS response data from the JSON file and - # re-structures it into dnspython classes. - class Ans: # mocks the dns.resolver.Answer class + # Loads the saved DNS response data from the JSON file and + # re-structures it into dnspython classes. + class Ans: # mocks the dns.resolver.Answer class + def __init__(self, rrset: dns.rdataset.Rdataset) -> None: + self.rrset = rrset - def __init__(self, rrset): - self.rrset = rrset - - def __iter__(self): - return iter(self.rrset) + def __iter__(self) -> Iterator[Any]: + return iter(self.rrset) + def load(self) -> None: with open(self.DATA_PATH) as f: data = json.load(f) for item in data: @@ -60,11 +65,11 @@ def __iter__(self): for rr in item["answer"] ] if item["answer"]: - self.data[key] = Ans(dns.rdataset.from_rdata_list(0, rdatas=rdatas)) + self.data[key] = MockedDnsResponseData.Ans(dns.rdataset.from_rdata_list(0, rdatas=rdatas)) else: self.data[key] = None - def save(self): + def save(self) -> None: # Re-structure as a list with basic data types. data = [ { @@ -79,14 +84,15 @@ def save(self): ]) } for key, value in self.data.items() + if value is not None ] with open(self.DATA_PATH, "w") as f: json.dump(data, f, indent=True) - def get(self, key): + def get(self, key: dns.resolver.CacheKey) -> Optional[Ans]: # Special-case a domain to create a timeout. if key[0].to_text() == "timeout.com.": - raise dns.exception.Timeout() + raise dns.exception.Timeout() # type: ignore [no-untyped-call] # When building the DNS response database, return # a cache miss. @@ -96,17 +102,17 @@ def get(self, key): # Query the data for a matching record. if key in self.data: if not self.data[key]: - raise dns.resolver.NoAnswer() + raise dns.resolver.NoAnswer() # type: ignore [no-untyped-call] return self.data[key] # Query the data for a response to an ANY query. ANY = dns.rdatatype.from_text("ANY") if (key[0], ANY, key[2]) in self.data and self.data[(key[0], ANY, key[2])] is None: - raise dns.resolver.NXDOMAIN() + raise dns.resolver.NXDOMAIN() # type: ignore [no-untyped-call] raise ValueError(f"Saved DNS data did not contain query: {key}") - def put(self, key, value): + def put(self, key: dns.resolver.CacheKey, value: Ans) -> None: # Build the DNS data by saving the live query response. if not BUILD_MOCKED_DNS_RESPONSE_DATA: raise ValueError("Should not get here.") @@ -114,8 +120,8 @@ def put(self, key, value): @pytest.fixture(scope="session", autouse=True) -def MockedDnsResponseDataCleanup(request): - def cleanup_func(): - if BUILD_MOCKED_DNS_RESPONSE_DATA: +def MockedDnsResponseDataCleanup(request: pytest.FixtureRequest) -> None: + def cleanup_func() -> None: + if BUILD_MOCKED_DNS_RESPONSE_DATA and MockedDnsResponseData.INSTANCE is not None: MockedDnsResponseData.INSTANCE.save() request.addfinalizer(cleanup_func) diff --git a/tests/test_deliverability.py b/tests/test_deliverability.py index 52124eb..e1307c2 100644 --- a/tests/test_deliverability.py +++ b/tests/test_deliverability.py @@ -1,3 +1,5 @@ +from typing import Any + import pytest import re @@ -10,36 +12,34 @@ RESOLVER = MockedDnsResponseData.create_resolver() -def test_deliverability_found(): - response = validate_email_deliverability('gmail.com', 'gmail.com', dns_resolver=RESOLVER) - assert response.keys() == {'mx', 'mx_fallback_type'} - assert response['mx_fallback_type'] is None - assert len(response['mx']) > 1 - assert len(response['mx'][0]) == 2 - assert isinstance(response['mx'][0][0], int) - assert response['mx'][0][1].endswith('.com') - - -def test_deliverability_fails(): - # Domain does not exist. - domain = 'xkxufoekjvjfjeodlfmdfjcu.com' - with pytest.raises(EmailUndeliverableError, match=f'The domain name {domain} does not exist'): - validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) - - # Null MX record. - domain = 'example.com' - with pytest.raises(EmailUndeliverableError, match=f'The domain name {domain} does not accept email'): - validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) +@pytest.mark.parametrize( + 'domain,expected_response', + [ + ('gmail.com', {'mx': [(5, 'gmail-smtp-in.l.google.com'), (10, 'alt1.gmail-smtp-in.l.google.com'), (20, 'alt2.gmail-smtp-in.l.google.com'), (30, 'alt3.gmail-smtp-in.l.google.com'), (40, 'alt4.gmail-smtp-in.l.google.com')], 'mx_fallback_type': None}), + ('pages.github.com', {'mx': [(0, 'pages.github.com')], 'mx_fallback_type': 'A'}), + ], +) +def test_deliverability_found(domain: str, expected_response: str) -> None: + response = validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) + assert response == expected_response - # No MX record, A record fallback, reject-all SPF record. - domain = 'nellis.af.mil' - with pytest.raises(EmailUndeliverableError, match=f'The domain name {domain} does not send email'): - validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) - # No MX or A/AAAA records, but some other DNS records must - # exist such that the response is NOANSWER instead of NXDOMAIN. - domain = 'justtxt.joshdata.me' - with pytest.raises(EmailUndeliverableError, match=f'The domain name {domain} does not accept email'): +@pytest.mark.parametrize( + 'domain,error', + [ + ('xkxufoekjvjfjeodlfmdfjcu.com', 'The domain name {domain} does not exist'), + ('example.com', 'The domain name {domain} does not accept email'), # Null MX record + ('g.mail.com', 'The domain name {domain} does not accept email'), # No MX record but invalid AAAA record fallback (issue #134) + ('nellis.af.mil', 'The domain name {domain} does not send email'), # No MX record, A record fallback, reject-all SPF record. + + # No MX or A/AAAA records, but some other DNS records must + # exist such that the response is NOANSWER instead of NXDOMAIN. + ('justtxt.joshdata.me', 'The domain name {domain} does not accept email'), + ('ipv6only.joshdata.me', 'The domain name {domain} does not accept email'), + ], +) +def test_deliverability_fails(domain: str, error: str) -> None: + with pytest.raises(EmailUndeliverableError, match=error.format(domain=domain)): validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) @@ -51,7 +51,7 @@ def test_deliverability_fails(): ('me@mail.example.com'), ], ) -def test_email_example_reserved_domain(email_input): +def test_email_example_reserved_domain(email_input: str) -> None: # Since these all fail deliverabiltiy from a static list, # DNS deliverability checks do not arise. with pytest.raises(EmailUndeliverableError) as exc_info: @@ -60,22 +60,27 @@ def test_email_example_reserved_domain(email_input): assert re.match(r"The domain name [a-z\.]+ does not (accept email|exist)\.", str(exc_info.value)) is not None -def test_deliverability_dns_timeout(): +def test_deliverability_dns_timeout() -> None: response = validate_email_deliverability('timeout.com', 'timeout.com', dns_resolver=RESOLVER) assert "mx" not in response assert response.get("unknown-deliverability") == "timeout" +def test_timeout_and_resolver() -> None: + with pytest.raises(ValueError, match="It's not valid to pass both timeout and dns_resolver."): + validate_email_deliverability('timeout.com', 'timeout.com', timeout=1, dns_resolver=RESOLVER) + + @pytest.mark.network -def test_caching_dns_resolver(): +def test_caching_dns_resolver() -> None: class TestCache: - def __init__(self): - self.cache = {} + def __init__(self) -> None: + self.cache: dict[Any, Any] = {} - def get(self, key): + def get(self, key: Any) -> Any: return self.cache.get(key) - def put(self, key, value): + def put(self, key: Any, value: Any) -> Any: self.cache[key] = value cache = TestCache() diff --git a/tests/test_main.py b/tests/test_main.py index 579163f..f11087b 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -1,3 +1,4 @@ +import typing import pytest from email_validator import validate_email, EmailSyntaxError @@ -9,14 +10,22 @@ RESOLVER = MockedDnsResponseData.create_resolver() -def test_dict_accessor(): +def test_dict_accessor() -> None: input_email = "testaddr@example.tld" valid_email = validate_email(input_email, check_deliverability=False) assert isinstance(valid_email.as_dict(), dict) assert valid_email.as_dict()["original"] == input_email -def test_main_single_good_input(monkeypatch, capsys): +def test_dict_accessor_with_domain_address() -> None: + input_email = "me@[127.0.0.1]" + valid_email = validate_email(input_email, check_deliverability=False, allow_domain_literal=True) + assert valid_email.domain == "[127.0.0.1]" + assert isinstance(valid_email.as_dict(), dict) + assert valid_email.as_dict()["domain_address"] == '"IPv4Address(\'127.0.0.1\')"' + + +def test_main_single_good_input(monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: import json test_email = "google@google.com" monkeypatch.setattr('sys.argv', ['email_validator', test_email]) @@ -27,7 +36,7 @@ def test_main_single_good_input(monkeypatch, capsys): assert validate_email(test_email, dns_resolver=RESOLVER).original == output["original"] -def test_main_single_bad_input(monkeypatch, capsys): +def test_main_single_bad_input(monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: bad_email = 'test@..com' monkeypatch.setattr('sys.argv', ['email_validator', bad_email]) validator_command_line_tool(dns_resolver=RESOLVER) @@ -35,7 +44,7 @@ def test_main_single_bad_input(monkeypatch, capsys): assert stdout == 'An email address cannot have a period immediately after the @-sign.\n' -def test_main_multi_input(monkeypatch, capsys): +def test_main_multi_input(monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: import io test_cases = ["google1@google.com", "google2@google.com", "test@.com", "test3@.com"] test_input = io.StringIO("\n".join(test_cases)) @@ -49,7 +58,7 @@ def test_main_multi_input(monkeypatch, capsys): assert test_cases[3] in stdout -def test_bytes_input(): +def test_bytes_input() -> None: input_email = b"testaddr@example.tld" valid_email = validate_email(input_email, check_deliverability=False) assert isinstance(valid_email.as_dict(), dict) @@ -60,8 +69,26 @@ def test_bytes_input(): validate_email(input_email, check_deliverability=False) -def test_deprecation(): +def test_deprecation() -> None: input_email = b"testaddr@example.tld" valid_email = validate_email(input_email, check_deliverability=False) with pytest.deprecated_call(): assert valid_email.email is not None + + +@pytest.mark.parametrize('invalid_email', [ + None, + 12345, + [], + {}, + lambda x: x, +]) +def test_invalid_type(invalid_email: typing.Any) -> None: + with pytest.raises(TypeError, match="email must be str or bytes"): + validate_email(invalid_email, check_deliverability=False) + + +def test_invalid_ascii() -> None: + invalid_email = b'\xd0\xba\xd0\xb2\xd1\x96\xd1\x82\xd0\xbe\xd1\x87\xd0\xba\xd0\xb0@\xd0\xbf\xd0\xbe\xd1\x88\xd1\x82\xd0\xb0.test' + with pytest.raises(EmailSyntaxError, match="The email address is not valid ASCII."): + validate_email(invalid_email, check_deliverability=False) diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 693d7da..9bd1385 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -1,3 +1,5 @@ +from typing import Any + import pytest from email_validator import EmailSyntaxError, \ @@ -5,12 +7,19 @@ ValidatedEmail +def MakeValidatedEmail(**kwargs: Any) -> ValidatedEmail: + ret = ValidatedEmail() + for k, v in kwargs.items(): + setattr(ret, k, v) + return ret + + @pytest.mark.parametrize( 'email_input,output', [ ( 'Abc@example.tld', - ValidatedEmail( + MakeValidatedEmail( local_part='Abc', ascii_local_part='Abc', smtputf8=False, @@ -22,7 +31,7 @@ ), ( 'Abc.123@test-example.com', - ValidatedEmail( + MakeValidatedEmail( local_part='Abc.123', ascii_local_part='Abc.123', smtputf8=False, @@ -34,7 +43,7 @@ ), ( 'user+mailbox/department=shipping@example.tld', - ValidatedEmail( + MakeValidatedEmail( local_part='user+mailbox/department=shipping', ascii_local_part='user+mailbox/department=shipping', smtputf8=False, @@ -46,7 +55,7 @@ ), ( "!#$%&'*+-/=?^_`.{|}~@example.tld", - ValidatedEmail( + MakeValidatedEmail( local_part="!#$%&'*+-/=?^_`.{|}~", ascii_local_part="!#$%&'*+-/=?^_`.{|}~", smtputf8=False, @@ -58,7 +67,7 @@ ), ( 'jeff@臺網中心.tw', - ValidatedEmail( + MakeValidatedEmail( local_part='jeff', ascii_local_part='jeff', smtputf8=False, @@ -70,7 +79,7 @@ ), ( '"quoted local part"@example.org', - ValidatedEmail( + MakeValidatedEmail( local_part='"quoted local part"', ascii_local_part='"quoted local part"', smtputf8=False, @@ -82,7 +91,7 @@ ), ( '"de-quoted.local.part"@example.org', - ValidatedEmail( + MakeValidatedEmail( local_part='de-quoted.local.part', ascii_local_part='de-quoted.local.part', smtputf8=False, @@ -92,17 +101,57 @@ ascii_email='de-quoted.local.part@example.org' ), ), + ( + 'MyName ', + MakeValidatedEmail( + local_part='me', + ascii_local_part='me', + smtputf8=False, + ascii_domain='example.org', + domain='example.org', + normalized='me@example.org', + ascii_email='me@example.org', + display_name="MyName" + ), + ), + ( + 'My Name ', + MakeValidatedEmail( + local_part='me', + ascii_local_part='me', + smtputf8=False, + ascii_domain='example.org', + domain='example.org', + normalized='me@example.org', + ascii_email='me@example.org', + display_name="My Name" + ), + ), + ( + r'"My.\"Na\\me\".Is" <"me \" \\ me"@example.org>', + MakeValidatedEmail( + local_part=r'"me \" \\ me"', + ascii_local_part=r'"me \" \\ me"', + smtputf8=False, + ascii_domain='example.org', + domain='example.org', + normalized=r'"me \" \\ me"@example.org', + ascii_email=r'"me \" \\ me"@example.org', + display_name='My."Na\\me".Is' + ), + ), ], ) -def test_email_valid(email_input, output): +def test_email_valid(email_input: str, output: ValidatedEmail) -> None: # These addresses do not require SMTPUTF8. See test_email_valid_intl_local_part # for addresses that are valid but require SMTPUTF8. Check that it passes with # allow_smtput8 both on and off. emailinfo = validate_email(email_input, check_deliverability=False, allow_smtputf8=False, - allow_quoted_local=True) + allow_quoted_local=True, allow_display_name=True) + assert emailinfo == output assert validate_email(email_input, check_deliverability=False, allow_smtputf8=True, - allow_quoted_local=True) == output + allow_quoted_local=True, allow_display_name=True) == output # Check that the old `email` attribute to access the normalized form still works # if the DeprecationWarning is suppressed. @@ -117,7 +166,7 @@ def test_email_valid(email_input, output): [ ( '伊昭傑@郵件.商務', - ValidatedEmail( + MakeValidatedEmail( local_part='伊昭傑', smtputf8=True, ascii_domain='xn--5nqv22n.xn--lhr59c', @@ -127,7 +176,7 @@ def test_email_valid(email_input, output): ), ( 'राम@मोहन.ईन्फो', - ValidatedEmail( + MakeValidatedEmail( local_part='राम', smtputf8=True, ascii_domain='xn--l2bl7a9d.xn--o1b8dj2ki', @@ -137,7 +186,7 @@ def test_email_valid(email_input, output): ), ( 'юзер@екзампл.ком', - ValidatedEmail( + MakeValidatedEmail( local_part='юзер', smtputf8=True, ascii_domain='xn--80ajglhfv.xn--j1aef', @@ -147,7 +196,7 @@ def test_email_valid(email_input, output): ), ( 'θσερ@εχαμπλε.ψομ', - ValidatedEmail( + MakeValidatedEmail( local_part='θσερ', smtputf8=True, ascii_domain='xn--mxahbxey0c.xn--xxaf0a', @@ -157,7 +206,7 @@ def test_email_valid(email_input, output): ), ( '葉士豪@臺網中心.tw', - ValidatedEmail( + MakeValidatedEmail( local_part='葉士豪', smtputf8=True, ascii_domain='xn--fiqq24b10vi0d.tw', @@ -167,7 +216,7 @@ def test_email_valid(email_input, output): ), ( '葉士豪@臺網中心.台灣', - ValidatedEmail( + MakeValidatedEmail( local_part='葉士豪', smtputf8=True, ascii_domain='xn--fiqq24b10vi0d.xn--kpry57d', @@ -177,7 +226,7 @@ def test_email_valid(email_input, output): ), ( 'jeff葉@臺網中心.tw', - ValidatedEmail( + MakeValidatedEmail( local_part='jeff葉', smtputf8=True, ascii_domain='xn--fiqq24b10vi0d.tw', @@ -187,7 +236,7 @@ def test_email_valid(email_input, output): ), ( 'ñoñó@example.tld', - ValidatedEmail( + MakeValidatedEmail( local_part='ñoñó', smtputf8=True, ascii_domain='example.tld', @@ -197,7 +246,7 @@ def test_email_valid(email_input, output): ), ( '我買@example.tld', - ValidatedEmail( + MakeValidatedEmail( local_part='我買', smtputf8=True, ascii_domain='example.tld', @@ -207,7 +256,7 @@ def test_email_valid(email_input, output): ), ( '甲斐黒川日本@example.tld', - ValidatedEmail( + MakeValidatedEmail( local_part='甲斐黒川日本', smtputf8=True, ascii_domain='example.tld', @@ -217,7 +266,7 @@ def test_email_valid(email_input, output): ), ( 'чебурашкаящик-с-апельсинами.рф@example.tld', - ValidatedEmail( + MakeValidatedEmail( local_part='чебурашкаящик-с-апельсинами.рф', smtputf8=True, ascii_domain='example.tld', @@ -227,7 +276,7 @@ def test_email_valid(email_input, output): ), ( 'उदाहरण.परीक्ष@domain.with.idn.tld', - ValidatedEmail( + MakeValidatedEmail( local_part='उदाहरण.परीक्ष', smtputf8=True, ascii_domain='domain.with.idn.tld', @@ -237,7 +286,7 @@ def test_email_valid(email_input, output): ), ( 'ιωάννης@εεττ.gr', - ValidatedEmail( + MakeValidatedEmail( local_part='ιωάννης', smtputf8=True, ascii_domain='xn--qxaa9ba.gr', @@ -245,15 +294,36 @@ def test_email_valid(email_input, output): normalized='ιωάννης@εεττ.gr', ), ), + ( + '\"s\u0323\u0307\" ', + MakeValidatedEmail( + local_part='\u1E69', + smtputf8=True, + ascii_domain='nfc.tld', + domain='nfc.tld', + normalized='\u1E69@nfc.tld', + display_name='\u1E69' + ), + ), + ( + '@@fullwidth.at', + MakeValidatedEmail( + local_part='@', + smtputf8=True, + ascii_domain='fullwidth.at', + domain='fullwidth.at', + normalized='@@fullwidth.at', + ), + ), ], ) -def test_email_valid_intl_local_part(email_input, output): +def test_email_valid_intl_local_part(email_input: str, output: ValidatedEmail) -> None: # Check that it passes when allow_smtputf8 is True. - assert validate_email(email_input, check_deliverability=False) == output + assert validate_email(email_input, check_deliverability=False, allow_display_name=True) == output # Check that it fails when allow_smtputf8 is False. with pytest.raises(EmailSyntaxError) as exc_info: - validate_email(email_input, allow_smtputf8=False, check_deliverability=False) + validate_email(email_input, allow_smtputf8=False, check_deliverability=False, allow_display_name=True) assert "Internationalized characters before the @-sign are not supported: " in str(exc_info.value) @@ -269,7 +339,7 @@ def test_email_valid_intl_local_part(email_input, output): ('"quoted.with..unicode.λ"@example.com', '"quoted.with..unicode.λ"'), ('"quoted.with.extraneous.\\escape"@example.com', 'quoted.with.extraneous.escape'), ]) -def test_email_valid_only_if_quoted_local_part(email_input, normalized_local_part): +def test_email_valid_only_if_quoted_local_part(email_input: str, normalized_local_part: str) -> None: # These addresses are invalid with the default allow_quoted_local=False option. with pytest.raises(EmailSyntaxError) as exc_info: validate_email(email_input) @@ -283,7 +353,7 @@ def test_email_valid_only_if_quoted_local_part(email_input, normalized_local_par assert validated.local_part == normalized_local_part -def test_domain_literal(): +def test_domain_literal() -> None: # Check parsing IPv4 addresses. validated = validate_email("me@[127.0.0.1]", allow_domain_literal=True) assert validated.domain == "[127.0.0.1]" @@ -303,6 +373,9 @@ def test_domain_literal(): @pytest.mark.parametrize( 'email_input,error_msg', [ + ('hello.world', 'An email address must have an @-sign.'), + ('hello@world', 'The email address has the "full-width" at-sign (@) character instead of a regular at-sign.'), + ('hello﹫world', 'The email address has the "small commercial at" character instead of a regular at-sign.'), ('my@localhost', 'The part after the @-sign is not valid. It should have a period.'), ('my@.leadingdot.com', 'An email address cannot have a period immediately after the @-sign.'), ('my@.leadingfwdot.com', 'An email address cannot have a period immediately after the @-sign.'), @@ -329,28 +402,36 @@ def test_domain_literal(): ('.leadingdot@domain.com', 'An email address cannot start with a period.'), ('twodots..here@domain.com', 'An email address cannot have two periods in a row.'), ('trailingdot.@domain.email', 'An email address cannot have a period immediately before the @-sign.'), - ('me@⒈wouldbeinvalid.com', - "The part after the @-sign contains invalid characters (Codepoint U+2488 not allowed " - "at position 1 in '⒈wouldbeinvalid.com')."), + ('me@⒈wouldbeinvalid.com', "The part after the @-sign contains invalid characters: '⒈'."), + ('me@\u037e.com', "The part after the @-sign contains invalid characters after Unicode normalization: ';'."), + ('me@\u1fef.com', "The part after the @-sign contains invalid characters after Unicode normalization: '`'."), ('@example.com', 'There must be something before the @-sign.'), ('white space@test', 'The email address contains invalid characters before the @-sign: SPACE.'), ('test@white space', 'The part after the @-sign contains invalid characters: SPACE.'), ('\nmy@example.com', 'The email address contains invalid characters before the @-sign: U+000A.'), ('m\ny@example.com', 'The email address contains invalid characters before the @-sign: U+000A.'), ('my\n@example.com', 'The email address contains invalid characters before the @-sign: U+000A.'), + ('me.\u037e@example.com', 'After Unicode normalization: The email address contains invalid characters before the @-sign: \';\'.'), ('test@\n', 'The part after the @-sign contains invalid characters: U+000A.'), ('bad"quotes"@example.com', 'The email address contains invalid characters before the @-sign: \'"\'.'), ('obsolete."quoted".atom@example.com', 'The email address contains invalid characters before the @-sign: \'"\'.'), - ('11111111112222222222333333333344444444445555555555666666666677777@example.com', 'The email address is too long before the @-sign (1 character too many).'), - ('111111111122222222223333333333444444444455555555556666666666777777@example.com', 'The email address is too long before the @-sign (2 characters too many).'), - ('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.111111111122222222223333333333444444444455555555556.com', 'The email address is too long (4 characters too many).'), - ('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555566.com', 'The email address is too long after the @-sign (1 character too many).'), - ('me@中1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555566.com', 'The email address is too long after the @-sign.'), + ('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444444445555555555.com', 'The email address is too long after the @-sign (1 character too many).'), + ('me@中1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444.com', 'The email address is too long after the @-sign (1 byte too many after IDNA encoding).'), + ('me@\uFB2C1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444.com', 'The email address is too long after the @-sign (5 bytes too many after IDNA encoding).'), + ('me@1111111111222222222233333333334444444444555555555666666666677777.com', 'After the @-sign, periods cannot be separated by so many characters (1 character too many).'), + ('me@11111111112222222222333333333344444444445555555556666666666777777.com', 'After the @-sign, periods cannot be separated by so many characters (2 characters too many).'), + ('me@中111111111222222222233333333334444444444555555555666666.com', 'The part after the @-sign is invalid (Label too long).'), + ('meme@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.com', 'The email address is too long (4 characters too many).'), ('my.long.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444.info', 'The email address is too long (2 characters too many).'), - ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333.info', 'The email address is too long (when converted to IDNA ASCII).'), - ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (at least 1 character too many).'), - ('my.λong.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.111111111122222222223333333333444.info', 'The email address is too long (when encoded in bytes).'), - ('my.λong.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (at least 1 character too many).'), + ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (1-2 characters too many).'), + ('my.long.address@\uFB2C111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (1-3 characters too many).'), + ('my.λong.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.111111111122222222223333333333444.info', 'The email address is too long (1 character too many).'), + ('my.λong.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (1-2 characters too many).'), + ('my.\u0073\u0323\u0307.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (1-2 characters too many).'), + ('my.\uFB2C.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444.info', 'The email address is too long (1 character too many).'), + ('my.\uFB2C.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344.info', 'The email address is too long after normalization (1 byte too many).'), + ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333.info', 'The email address is too long when the part after the @-sign is converted to IDNA ASCII (1 byte too many).'), + ('my.λong.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333.info', 'The email address is too long when the part after the @-sign is converted to IDNA ASCII (2 bytes too many).'), ('me@bad-tld-1', 'The part after the @-sign is not valid. It should have a period.'), ('me@bad.tld-2', 'The part after the @-sign is not valid. It is not within a valid top-level domain.'), ('me@xn--0.tld', 'The part after the @-sign is not valid IDNA (Invalid A-label).'), @@ -363,13 +444,39 @@ def test_domain_literal(): ('me@[tag:text]', 'The part after the @-sign contains an invalid address literal tag in brackets.'), ('me@[untaggedtext]', 'The part after the @-sign in brackets is not an IPv4 address and has no address literal tag.'), ('me@[tag:invalid space]', 'The part after the @-sign contains invalid characters in brackets: SPACE.'), + ('', 'A display name and angle brackets around the email address are not permitted here.'), + (' !', 'There can\'t be anything after the email address.'), + ('<\u0338me@example.com', 'The email address contains invalid characters before the @-sign: \'<\'.'), + ('DisplayName ', 'An email address cannot have a hyphen immediately after the @-sign.'), + ('DisplayName ', 'A display name and angle brackets around the email address are not permitted here.'), + ('Display Name ', 'A display name and angle brackets around the email address are not permitted here.'), + ('\"Display Name\" ', 'A display name and angle brackets around the email address are not permitted here.'), + ('Display.Name ', 'The display name contains invalid characters when not quoted: \'.\'.'), + ('\"Display.Name\" ', 'A display name and angle brackets around the email address are not permitted here.'), ], ) -def test_email_invalid_syntax(email_input, error_msg): +def test_email_invalid_syntax(email_input: str, error_msg: str) -> None: # Since these all have syntax errors, deliverability # checks do not arise. with pytest.raises(EmailSyntaxError) as exc_info: - validate_email(email_input) + validate_email(email_input, check_deliverability=False) + assert str(exc_info.value) == error_msg + + +@pytest.mark.parametrize( + 'email_input,error_msg', + [ + ('11111111112222222222333333333344444444445555555555666666666677777@example.com', 'The email address is too long before the @-sign (1 character too many).'), + ('111111111122222222223333333333444444444455555555556666666666777777@example.com', 'The email address is too long before the @-sign (2 characters too many).'), + ('\uFB2C111111122222222223333333333444444444455555555556666666666777777@example.com', 'After Unicode normalization: The email address is too long before the @-sign (2 characters too many).'), + ]) +def test_email_invalid_syntax_strict(email_input: str, error_msg: str) -> None: + # Since these all have syntax errors, deliverability + # checks do not arise. + validate_email(email_input, check_deliverability=False) # pass without strict + with pytest.raises(EmailSyntaxError) as exc_info: + validate_email(email_input, strict=True, check_deliverability=False) assert str(exc_info.value) == error_msg @@ -384,7 +491,7 @@ def test_email_invalid_syntax(email_input, error_msg): ('me@test.test.test'), ], ) -def test_email_invalid_reserved_domain(email_input): +def test_email_invalid_reserved_domain(email_input: str) -> None: # Since these all fail deliverabiltiy from a static list, # DNS deliverability checks do not arise. with pytest.raises(EmailSyntaxError) as exc_info: @@ -408,7 +515,7 @@ def test_email_invalid_reserved_domain(email_input): ('\uFDEF', 'U+FDEF'), # unassigned (Cn) ], ) -def test_email_unsafe_character(s, expected_error): +def test_email_unsafe_character(s: str, expected_error: str) -> None: # Check for various unsafe characters that are permitted by the email # specs but should be disallowed for being unsafe or not sensible Unicode. @@ -428,26 +535,26 @@ def test_email_unsafe_character(s, expected_error): ('"quoted.with..unicode.λ"@example.com', 'Internationalized characters before the @-sign are not supported: \'λ\'.'), ], ) -def test_email_invalid_character_smtputf8_off(email_input, expected_error): +def test_email_invalid_character_smtputf8_off(email_input: str, expected_error: str) -> None: # Check that internationalized characters are rejected if allow_smtputf8=False. with pytest.raises(EmailSyntaxError) as exc_info: validate_email(email_input, allow_smtputf8=False, test_environment=True) assert str(exc_info.value) == expected_error -def test_email_empty_local(): +def test_email_empty_local() -> None: validate_email("@test", allow_empty_local=True, test_environment=True) # This next one might not be desirable. validate_email("\"\"@test", allow_empty_local=True, allow_quoted_local=True, test_environment=True) -def test_email_test_domain_name_in_test_environment(): +def test_email_test_domain_name_in_test_environment() -> None: validate_email("anything@test", test_environment=True) validate_email("anything@mycompany.test", test_environment=True) -def test_case_insensitive_mailbox_name(): +def test_case_insensitive_mailbox_name() -> None: validate_email("POSTMASTER@test", test_environment=True).normalized = "postmaster@test" validate_email("NOT-POSTMASTER@test", test_environment=True).normalized = "NOT-POSTMASTER@test" @@ -627,13 +734,18 @@ def test_case_insensitive_mailbox_name(): ['test.(comment)test@iana.org', 'ISEMAIL_DEPREC_COMMENT'] ] ) -def test_pyisemail_tests(email_input, status): +def test_pyisemail_tests(email_input: str, status: str) -> None: if status == "ISEMAIL_VALID": # All standard email address forms should not raise an exception # with any set of parsing options. validate_email(email_input, test_environment=True) validate_email(email_input, allow_quoted_local=True, allow_domain_literal=True, test_environment=True) + elif status == "ISEMAIL_RFC5322_LOCAL_TOOLONG": + # Requires strict. + with pytest.raises(EmailSyntaxError): + validate_email(email_input, strict=True, test_environment=True) + elif status == "ISEMAIL_RFC5321_QUOTEDSTRING": # Quoted-literal local parts are only valid with an option. with pytest.raises(EmailSyntaxError):