From 4a8fa113b3ad04aec9af0f376d8860a85b2a8644 Mon Sep 17 00:00:00 2001
From: Andreas Marek
Date: Thu, 19 Mar 2026 09:30:59 +1000
Subject: [PATCH] Add dangerous Unicode character detection to pre-commit hook
 and CI

Detect invisible and rendering-altering Unicode characters that can be
used for Trojan Source (BiDi override) and glassworm-style attacks.

Blocked categories: C0/C1 control characters (except TAB/LF/CR),
zero-width characters (U+200B-200D, U+FEFF), and BiDi override/isolate
characters (U+202A-202E, U+2066-2069).

Uses perl for macOS portability (grep -P is unavailable on macOS).
Binary files are skipped, judged by `file -b --mime-encoding` so that
textual files whose MIME type is not text/* (JSON, XML, SVG, ...) are
still scanned. Offending lines are reported with printf '%s' so that
backslash sequences in file content are not interpreted.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .githooks/pre-commit                 | 44 ++++++++++++++++++++++-
 .github/workflows/validate-files.yml | 54 +++++++++++++++++++++++++++-
 CONTRIBUTING.md                      |  2 ++
 3 files changed, 98 insertions(+), 2 deletions(-)

diff --git a/.githooks/pre-commit b/.githooks/pre-commit
index eee3d5d198..a01b246ca9 100755
--- a/.githooks/pre-commit
+++ b/.githooks/pre-commit
@@ -1,11 +1,14 @@
 #!/bin/bash
 
-# Pre-commit hook to enforce Windows compatibility and file size limits
+# Pre-commit hook to enforce Windows compatibility, file size limits,
+# and dangerous Unicode character detection.
 #
 # 1. Windows filenames: prevents characters that are reserved on Windows (< > : " | ? * \)
 #    so the repo can be cloned on Windows systems.
 # 2. File size: rejects files larger than 10 MB. Many enterprise users mirror graphql-java
 #    into internal repositories that enforce file size limits.
+# 3. Dangerous Unicode: detects invisible/control characters that can be used for
+#    "Trojan Source" (BiDi override) or glassworm-style attacks.
 
 # ANSI color codes for better output readability
 RED='\033[0;31m'
@@ -75,6 +78,45 @@ if [ -n "$LARGE_FILES" ]; then
     ERRORS_FOUND=1
 fi
 
+# Check 3: Dangerous Unicode characters (Trojan Source / glassworm attacks)
+# Detects: C0/C1 control chars (except TAB, LF, CR), zero-width characters,
+#   BiDi override/embedding/isolate chars.
+# Uses perl for macOS compatibility (grep -P is not available on macOS).
+echo " Checking for dangerous Unicode characters..."
+
+UNICODE_FILES=""
+if [ -n "$STAGED_FILES" ]; then
+    while IFS= read -r file; do
+        if [ ! -f "$file" ]; then
+            continue
+        fi
+        # Skip binary files; judge by encoding so JSON/XML/SVG etc. are still scanned
+        if [ "$(file -b --mime-encoding -- "$file" 2>/dev/null)" = "binary" ]; then
+            continue
+        fi
+        MATCHES=$(perl -CSD -ne '
+            if (/[\x{0000}-\x{0008}\x{000B}\x{000C}\x{000E}-\x{001F}\x{007F}-\x{009F}\x{200B}-\x{200D}\x{FEFF}\x{202A}-\x{202E}\x{2066}-\x{2069}]/) {
+                print " line $.: $_";
+            }
+        ' "$file" 2>/dev/null || true)
+        if [ -n "$MATCHES" ]; then
+            UNICODE_FILES="${UNICODE_FILES} - ${file}"$'\n'"${MATCHES}"$'\n'
+        fi
+    done <<< "$STAGED_FILES"
+fi
+
+if [ -n "$UNICODE_FILES" ]; then
+    echo -e "${RED}Error: The following files contain dangerous Unicode characters:${NC}"
+    printf '%s\n' "$UNICODE_FILES"
+    echo -e "${YELLOW}These characters are invisible or alter text rendering and can be used for${NC}"
+    echo -e "${YELLOW}Trojan Source or glassworm-style attacks. Detected character categories:${NC}"
+    echo -e "${YELLOW} - C0/C1 control characters (U+0000-001F, U+007F-009F, except TAB/LF/CR)${NC}"
+    echo -e "${YELLOW} - Zero-width characters (U+200B-200D, U+FEFF)${NC}"
+    echo -e "${YELLOW} - BiDi override/isolate (U+202A-202E, U+2066-2069)${NC}"
+    echo -e "${YELLOW}Please remove these characters from the affected files.${NC}"
+    ERRORS_FOUND=1
+fi
+
 # Exit with error if any checks failed
 if [ "$ERRORS_FOUND" -eq 1 ]; then
     echo -e "${RED}Pre-commit checks failed. Please fix the issues above and try again.${NC}"
diff --git a/.github/workflows/validate-files.yml b/.github/workflows/validate-files.yml
index 26be4eacda..3c1cc83e61 100644
--- a/.github/workflows/validate-files.yml
+++ b/.github/workflows/validate-files.yml
@@ -5,6 +5,8 @@ name: Validate Files
 #    so the repo can be cloned on Windows systems.
 # 2. File size limits — no files larger than 10 MB. Many enterprise users mirror
 #    graphql-java into internal repositories that enforce file size limits.
+# 3. No dangerous Unicode characters — prevents Trojan Source (BiDi override),
+#    glassworm, and similar attacks using invisible or control characters.
 
 on:
   push:
@@ -24,7 +26,7 @@ permissions:
 jobs:
   validate-filenames-and-size:
     runs-on: ubuntu-latest
-    name: Validate Windows Compatibility and File Sizes
+    name: Validate Files (Windows names, size, Unicode safety)
     steps:
       - name: Checkout code
         uses: actions/checkout@v6
@@ -96,3 +98,53 @@ jobs:
           else
             echo "✓ All files are within the 10MB size limit"
           fi
+
+      - name: Check for dangerous Unicode characters
+        run: |
+          echo "Checking for dangerous Unicode characters (Trojan Source / glassworm)..."
+
+          # Dangerous character ranges:
+          #   U+0000-0008, U+000B-000C, U+000E-001F C0 control chars (except TAB, LF, CR)
+          #   U+007F-009F DELETE + C1 control chars
+          #   U+200B-200D Zero-width space/non-joiner/joiner
+          #   U+FEFF Zero-width no-break space (BOM)
+          #   U+202A-202E BiDi embedding/override (Trojan Source)
+          #   U+2066-2069 BiDi isolate chars (Trojan Source)
+
+          FOUND_FILES=""
+
+          while IFS= read -r file; do
+            if [ ! -f "$file" ]; then
+              continue
+            fi
+            # Skip binary files; judge by encoding so JSON/XML/SVG etc. are still scanned
+            if [ "$(file -b --mime-encoding -- "$file" 2>/dev/null)" = "binary" ]; then
+              continue
+            fi
+            MATCHES=$(perl -CSD -ne '
+              if (/[\x{0000}-\x{0008}\x{000B}\x{000C}\x{000E}-\x{001F}\x{007F}-\x{009F}\x{200B}-\x{200D}\x{FEFF}\x{202A}-\x{202E}\x{2066}-\x{2069}]/) {
+                print " line $.: $_";
+              }
+            ' "$file" 2>/dev/null || true)
+            if [ -n "$MATCHES" ]; then
+              echo "::error file=${file}::File contains dangerous Unicode characters"
+              FOUND_FILES="${FOUND_FILES}${file}:"$'\n'"${MATCHES}"$'\n'
+            fi
+          done <<< "$(git ls-files)"
+
+          if [ -n "$FOUND_FILES" ]; then
+            echo ""
+            echo "The following files contain dangerous Unicode characters:"
+            printf '%s\n' "$FOUND_FILES"
+            echo ""
+            echo "These invisible or rendering-altering characters can be used for"
+            echo "Trojan Source or glassworm-style attacks. Detected categories:"
+            echo " - C0/C1 control characters (U+0000-001F, U+007F-009F, except TAB/LF/CR)"
+            echo " - Zero-width characters (U+200B-200D, U+FEFF)"
+            echo " - BiDi override/isolate (U+202A-202E, U+2066-2069)"
+            echo ""
+            echo "Please remove these characters from the affected files."
+            exit 1
+          else
+            echo "✓ No dangerous Unicode characters found"
+          fi
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 615cee2784..8896cf18c2 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -40,6 +40,8 @@ The pre-commit hook will automatically check for:
   - Splitting them into smaller parts (`.part1`, `.part2`, etc.)
   - Reducing the file size
 
+- **Dangerous Unicode characters**: Files containing invisible or rendering-altering Unicode characters will be rejected. This protects against [Trojan Source](https://trojansource.codes/) (BiDi override) and glassworm-style attacks. Blocked character categories include C0/C1 control characters, zero-width characters, and BiDi overrides.
+
 To bypass the hooks temporarily (not recommended), use `git commit --no-verify`.
 
 ### CI Validation