Skip to content

Commit 52b2b52

Browse files
committed
[plsql,tsql] Fix CPD being case sensitive in PLSQL and TSQL (#4943)
Merge pull request #4943 from oowekyala:issue4396-cpd-case-sensitive
2 parents 3222807 + 12b9ece commit 52b2b52

26 files changed

Lines changed: 2673 additions & 763 deletions

File tree

docs/pages/pmd/devdocs/major_contributions/adding_new_cpd_language.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ title: How to add a new CPD language
33
short_title: Adding a new CPD language
44
tags: [devdocs, extending]
55
summary: How to add a new language module with CPD support.
6-
last_updated: April 2023 (7.0.0)
6+
last_updated: June 2024 (7.3.0)
77
permalink: pmd_devdocs_major_adding_new_cpd_language.html
88
author: Matías Fraga, Clément Fournier
99
---
@@ -45,8 +45,15 @@ Use the following guide to set up a new language module that supports CPD.
4545
}
4646
```
4747

48+
- If your language is case-insensitive, then you might want to override `getImage(AntlrToken)`. There you can
49+
change each token e.g. into uppercase, so that CPD sees the same strings and can find duplicates even when
50+
the casing differs. See {% jdoc tsql::lang.tsql.cpd.TSqlCpdLexer %} for an example. You will also need a
51+
"CaseChangingCharStream", so that the ANTLR lexer itself matches input case-insensitively.
4852
- For JavaCC grammars, place your grammar in `etc/grammar` and edit the `pom.xml` like the [Python implementation](https://github.com/pmd/pmd/blob/master/pmd-python/pom.xml) does.
4953
You can then subclass {% jdoc core::cpd.impl.JavaccCpdLexer %} instead of AntlrCpdLexer.
54+
- If your JavaCC based language is case-insensitive (option `IGNORE_CASE=true`), then you need to implement
55+
{%jdoc core::lang.ast.impl.javacc.JavaccTokenDocument.TokenDocumentBehavior %}, which can change each token
56+
e.g. into uppercase. See {%jdoc plsql::lang.plsql.ast.PLSQLParser %} for an example.
5057
- For any other scenario just implement the interface however you can. Look at the Scala or Apex module for existing implementations.
5158

5259
3. Create a {% jdoc core::lang.Language %} implementation, and make it implement {% jdoc core::cpd.CpdCapableLanguage %}.

docs/pages/release_notes.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ See also [Maven PMD Plugin]({{ baseurl }}pmd_userdocs_tools_maven.html).
3939
* cli
4040
* [#2827](https://github.com/pmd/pmd/issues/2827): \[cli] Consider processing errors in exit status
4141
* core
42+
* [#4396](https://github.com/pmd/pmd/issues/4396): \[core] CPD is always case sensitive
4243
* [#4992](https://github.com/pmd/pmd/pull/4992): \[core] CPD: Include processing errors in XML report
4344
* [#5066](https://github.com/pmd/pmd/issues/5066): \[core] CPD throws java.lang.OutOfMemoryError: Java heap space (since 7.1.0)
4445
* apex
@@ -113,11 +114,18 @@ read the XML format should be updated.
113114
* {% jdoc !!core::cpd.CPDConfiguration#isSkipLexicalErrors() %} and {% jdoc core::cpd.CPDConfiguration#setSkipLexicalErrors(boolean) %}:
114115
Use {%jdoc core::AbstractConfiguration#setFailOnError(boolean) %} to control whether to ignore errors or fail the build.
115116
* {%jdoc !!core::cpd.XMLOldRenderer %} (the CPD format "xmlold").
117+
* The constructor
118+
{%jdoc !!core::lang.ast.impl.antlr4.AntlrToken#AntlrToken(org.antlr.v4.runtime.Token,core::lang.ast.impl.antlr4.AntlrToken,core::lang.document.TextDocument) %}
119+
shouldn't be used directly. Use {%jdoc core::lang.ast.impl.antlr4.AntlrTokenManager %} instead.
116120
* pmd-java
117121
* {% jdoc !!java::lang.java.ast.ASTResource#getStableName() %} and the corresponding attribute `@StableName`.
118122
* {%jdoc !!java::lang.java.ast.ASTRecordPattern#getVarId() %} This method was added here by mistake. Record
119123
patterns don't declare a pattern variable for the whole pattern, but rather for individual record
120124
components, which can be accessed via {%jdoc java::lang.java.ast.ASTRecordPattern#getComponentPatterns() %}.
125+
* pmd-plsql
126+
* {%jdoc plsql::lang.plsql.ast.PLSQLParserImpl %} is deprecated now. It should have been package-private
127+
because this is an implementation class that should not be used directly.
128+
* The node {%jdoc plsql::lang.plsql.ast.ASTKEYWORD_UNRESERVED %} is deprecated and is now removed from the AST.
121129

122130
#### Breaking changes: pmd-compat6 removed
123131

pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/antlr4/AntlrToken.java

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,13 @@
1717
*/
1818
public class AntlrToken implements GenericToken<AntlrToken> {
1919

20-
private final Token token;
2120
private final AntlrToken previousComment;
2221
private final TextDocument textDoc;
22+
private final String image;
23+
private final int endOffset;
24+
private final int startOffset;
25+
private final int channel;
26+
private final int kind;
2327
AntlrToken next;
2428

2529

@@ -29,11 +33,18 @@ public class AntlrToken implements GenericToken<AntlrToken> {
2933
* @param token The antlr token implementation
3034
* @param previousComment The previous comment
3135
* @param textDoc The text document
36+
*
37+
* @deprecated Don't create ANTLR tokens directly; use an {@link AntlrTokenManager} instead.
3238
*/
39+
@Deprecated
3340
public AntlrToken(final Token token, final AntlrToken previousComment, TextDocument textDoc) {
34-
this.token = token;
3541
this.previousComment = previousComment;
3642
this.textDoc = textDoc;
43+
this.image = token.getText();
44+
this.startOffset = token.getStartIndex();
45+
this.endOffset = token.getStopIndex() + 1; // exclusive
46+
this.channel = token.getChannel();
47+
this.kind = token.getType();
3748
}
3849

3950
@Override
@@ -48,13 +59,13 @@ public AntlrToken getPreviousComment() {
4859

4960
@Override
5061
public CharSequence getImageCs() {
51-
return token.getText();
62+
return image;
5263
}
5364

5465
/** Returns a text region with the coordinates of this token. */
5566
@Override
5667
public TextRegion getRegion() {
57-
return TextRegion.fromBothOffsets(token.getStartIndex(), token.getStopIndex() + 1);
68+
return TextRegion.fromBothOffsets(startOffset, endOffset);
5869
}
5970

6071
@Override
@@ -74,14 +85,14 @@ public int compareTo(AntlrToken o) {
7485

7586
@Override
7687
public int getKind() {
77-
return token.getType();
88+
return kind;
7889
}
7990

8091
public boolean isHidden() {
8192
return !isDefault();
8293
}
8394

8495
public boolean isDefault() {
85-
return token.getChannel() == Lexer.DEFAULT_TOKEN_CHANNEL;
96+
return channel == Lexer.DEFAULT_TOKEN_CHANNEL;
8697
}
8798
}

pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/javacc/JavaccToken.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,16 @@ public String getImage() {
147147
return image.toString();
148148
}
149149

150+
/**
151+
* Returns the original text of the token.
152+
* The image may be normalized, e.g. for case-insensitive languages.
153+
*
154+
* @since 7.3.0
155+
*/
156+
public Chars getText() {
157+
return document.getTextDocument().sliceOriginalText(getRegion());
158+
}
159+
150160
@Override
151161
public final TextRegion getRegion() {
152162
return TextRegion.fromBothOffsets(startOffset, endOffset);

pmd-plsql/etc/grammar/PLSQL.jjt

Lines changed: 49 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,11 @@ import net.sourceforge.pmd.lang.plsql.ast.internal.ParsingExclusion;
171171
import java.util.ArrayList;
172172
import java.util.List;
173173

174+
/**
175+
* @deprecated PLSQLParserImpl should have been package-private because this is an implementation class
176+
* that should not be used directly.
177+
*/
178+
@Deprecated
174179
public class PLSQLParserImpl {
175180

176181
/**
@@ -224,7 +229,10 @@ public class PLSQLParserImpl {
224229
* Usage: <code>LOOKAHEAD({isKeyword("WAIT")}) KEYWORD("WAIT")</code>
225230
*/
226231
private boolean isKeyword(String keyword) {
227-
return getToken(1).kind == IDENTIFIER && getToken(1).getImage().equalsIgnoreCase(keyword);
232+
return getToken(1).kind == IDENTIFIER
233+
&& getToken(1).getImage().equalsIgnoreCase(keyword)
234+
// quoted identifiers are excluded
235+
&& getToken(1).getText().charAt(0) != '"';
228236
}
229237

230238
}
@@ -1715,18 +1723,18 @@ ASTSimpleExpression SimpleExpression() :
17151723
LOOKAHEAD(4) ID() "." ( <CURRVAL> | <NEXTVAL> )
17161724
|
17171725
LOOKAHEAD(6)
1718-
SchemaName() { sb.append(token.getImage()); } "." { sb.append(token.getImage()); }
1719-
TableName() { sb.append(token.getImage()); } "." { sb.append(token.getImage()); }
1720-
( "*" | Column() ) { sb.append(token.getImage()); }
1726+
SchemaName() { sb.append(token.getText()); } "." { sb.append(token.getImage()); }
1727+
TableName() { sb.append(token.getText()); } "." { sb.append(token.getImage()); }
1728+
( "*" | Column() ) { sb.append(token.getText()); }
17211729
|
17221730
LOOKAHEAD(4)
1723-
TableName() { sb.append(token.getImage()); } "." { sb.append(token.getImage()); }
1724-
( "*" | Column() ) { sb.append(token.getImage()); }
1731+
TableName() { sb.append(token.getText()); } "." { sb.append(token.getImage()); }
1732+
( "*" | Column() ) { sb.append(token.getText()); }
17251733
|
17261734
// Named Cursor: https://docs.oracle.com/en/database/oracle/oracle-database/18/lnpls/named-cursor-attribute.html#GUID-CD8D8415-FF19-4D81-99BA-7825FD40CC96
17271735
// Implicit Cursor: https://docs.oracle.com/en/database/oracle/oracle-database/18/lnpls/implicit-cursor-attribute.html#GUID-5A938EE7-E8D2-468C-B60F-81898F110BE1
17281736
LOOKAHEAD(3)
1729-
Column() { sb.append(token.getImage()); } "%" ( LOOKAHEAD({isKeyword("isopen")}) KEYWORD("ISOPEN")
1737+
Column() { sb.append(token.getText()); } "%" ( LOOKAHEAD({isKeyword("isopen")}) KEYWORD("ISOPEN")
17301738
| LOOKAHEAD({isKeyword("found")}) KEYWORD("FOUND")
17311739
| LOOKAHEAD({isKeyword("notfound")}) KEYWORD("NOTFOUND")
17321740
| LOOKAHEAD({isKeyword("rowcount")}) KEYWORD("ROWCOUNT")
@@ -1735,7 +1743,7 @@ ASTSimpleExpression SimpleExpression() :
17351743
) { sb.append('%').append(token.getImage()); }
17361744
|
17371745
LOOKAHEAD(2)
1738-
( "*" | Column() ) { sb.append(token.getImage()); }
1746+
( "*" | Column() ) { sb.append(token.getText()); }
17391747
)
17401748
{
17411749
jjtThis.setImage(sb.toString());
@@ -1744,11 +1752,11 @@ ASTSimpleExpression SimpleExpression() :
17441752
}
17451753

17461754
ASTOuterJoinExpression OuterJoinExpression() :
1747-
{ StringBuilder sb = new StringBuilder(); }
1755+
{ StringBuilder sb = new StringBuilder(); PLSQLNode node; }
17481756
{
1749-
[ LOOKAHEAD(6) SchemaName() { sb.append(token.getImage()); } "." { sb.append(token.getImage()); } ]
1750-
[ LOOKAHEAD(4) TableName() { sb.append(token.getImage()); } "." { sb.append(token.getImage()); } ]
1751-
Column() { sb.append(token.getImage()); }
1757+
[ LOOKAHEAD(6) node=SchemaName() { sb.append(node.getImage()); } "." { sb.append(token.getImage()); } ]
1758+
[ LOOKAHEAD(4) node=TableName() { sb.append(node.getImage()); } "." { sb.append(token.getImage()); } ]
1759+
node=Column() { sb.append(node.getImage()); }
17521760
"(" "+" ")"
17531761

17541762
{
@@ -5324,24 +5332,18 @@ TOKEN :
53245332
"\\"
53255333
>
53265334
|
5335+
< #ID_SIMPLE: ("$" | ":" | <LETTER>) ( <LETTER> | <DIGIT> | "$" | "_" | "#" )* >
5336+
|
53275337
< IDENTIFIER:
5328-
( ("$" | ":" | <LETTER>) ( <LETTER> | <DIGIT> | "$" | "_" | "#" )* ) // 2006-05-17 - Matthias Hendler - Bind variables are now accepted as identifiers.
5329-
//SRT Does NOT seem to like identifiers 2 or fewer characters( <LETTER> ( <LETTER> ) )
5330-
//( <LETTER> ( <DIGIT> ) )
5331-
//( <LETTER> ( "$" ) )
5332-
//( <LETTER> ( "_" ) )
5333-
//( <LETTER> ( "#" ) )
5338+
<ID_SIMPLE>
53345339
|
5335-
(
5336-
<LEXICAL_PARAMETER> ( <LETTER> | <DIGIT> | "$" | "_" | "#" )*
5337-
)
5338-
|
5339-
( (<LETTER> | "$" ) ( <LETTER> | <DIGIT> | "$" | "_" | "#" )* )
5340-
|
5341-
( "\"" <LETTER> ( <LETTER> | <DIGIT> | "$" | "_" | "#" )* "\"" )
5340+
( <LEXICAL_PARAMETER> ( <LETTER> | <DIGIT> | "$" | "_" | "#" )* )
5341+
|
5342+
// todo separate quoted identifier into other token
5343+
( "\"" <ID_SIMPLE> "\"" )
53425344
>
53435345
|
5344-
< LEXICAL_PARAMETER:
5346+
< #LEXICAL_PARAMETER:
53455347
(
53465348
("&&" | "&")
53475349
(
@@ -5365,12 +5367,6 @@ TOKEN :
53655367
|
53665368
< QUOTED_LITERAL: "\"" (<_WHATEVER_CHARACTER_WO_QUOTE> | <SPECIAL_CHARACTERS> | "\\\"")* "\"" >
53675369
|
5368-
< SQLDATA_CLASS: "SQLData" >
5369-
|
5370-
< CUSTOMDATUM_CLASS: "CustomDatum" >
5371-
|
5372-
< ORADATA_CLASS: "OraData" >
5373-
|
53745370
< JAVA_INTERFACE_CLASS: ( "SQLData" | "CustomDatum" | "OraData" ) >
53755371
//|
53765372
//< #BOOLEAN_LITERAL: "TRUE" | "FALSE" >
@@ -5404,8 +5400,8 @@ TOKEN :
54045400
| (["q","Q"]) "'<" (~[">"] | ">" ~["'"] )* ">"
54055401
| (["q","Q"]) "'(" (~[")"] | ")" ~["'"] )* ")"
54065402
>
5407-
| <(["n","N"])? "'" (<_WHATEVER_CHARACTER_WO_APOSTROPHE> | <SPECIAL_CHARACTERS> | "''")*> : IN_STRING_LITERAL_TOKENIZE
5408-
| <(["n","N"])? <_ALTERNATIVE_QUOTING_STRING_LITERAL>> : IN_STRING_LITERAL_TOKENIZE
5403+
| <(["n","N"])? "'" (<_WHATEVER_CHARACTER_WO_APOSTROPHE> | <SPECIAL_CHARACTERS> | "''")*> { input_stream.backup(1); } : IN_STRING_LITERAL_TOKENIZE
5404+
| <(["n","N"])? <_ALTERNATIVE_QUOTING_STRING_LITERAL>> { input_stream.backup(1); } : IN_STRING_LITERAL_TOKENIZE
54095405

54105406
// special handling for custom quote delimiters
54115407
| <(["n","N"])? (["q","Q"]) "'" (~[" ", "\t", "\r", "\n", "[", "{", "<", "("])> : IN_STRING_LITERAL
@@ -5420,12 +5416,14 @@ TOKEN :
54205416
}
54215417
int beforeQuote = image.charAt(image.length() - 2);
54225418
if (quoteDelimiter == beforeQuote) {
5423-
input_stream.backup(1);
5419+
input_stream.backup(2);
54245420
SwitchTo(IN_STRING_LITERAL_TOKENIZE);
54255421
}
54265422
}
54275423
}
5428-
<IN_STRING_LITERAL_TOKENIZE> TOKEN : { <STRING_LITERAL: "'"> : DEFAULT }
5424+
<IN_STRING_LITERAL_TOKENIZE> TOKEN : {
5425+
<STRING_LITERAL: ~[] "'"> : DEFAULT
5426+
}
54295427

54305428

54315429
/**
@@ -5544,11 +5542,15 @@ void KEYWORD(String id) #void:
55445542
* PL/SQL Keywords. They can be used as ordinary identifiers, but it is not recommended.
55455543
*
55465544
* https://docs.oracle.com/en/database/oracle/oracle-database/23/lnpls/plsql-language-fundamentals.html#GUID-53E09662-5AD4-4530-8C6B-FF3F7C7430D5
5545+
*
5546+
* @deprecated This is only used to generate a node class
55475547
*/
5548-
ASTKEYWORD_UNRESERVED KEYWORD_UNRESERVED (): {}
5548+
// @Deprecated
5549+
ASTKEYWORD_UNRESERVED KEYWORD_UNRESERVED (): {}{KEYWORD_NOT_RESERVED() {return jjtThis;}}
5550+
5551+
private void KEYWORD_NOT_RESERVED () #void: {}
55495552
{
55505553
// PL/SQL UNRESERVED KEYWORDS - V$RESERVED.RESERVED='N'
5551-
(
55525554
"REF" | "LAST" | "TRIM" | "OVER" | "UNBOUNDED" | "PRECEDING" | "FOLLOWING" | "WITHIN" |
55535555
"OVERFLOW" | "ERROR" | "WITHOUT" | "COUNT" | "SUBPARTITION" | "LOG" | "ERRORS" | "REJECT" | "UNLIMITED" |
55545556
<FALSE>
@@ -6601,9 +6603,6 @@ ASTKEYWORD_UNRESERVED KEYWORD_UNRESERVED (): {}
66016603
| <RENAME> //Although RENAME is an Oracle reserved word, it may be used as a PL/SQL name.
66026604
| <RELEASE> //Although RELEASE is an Oracle reserved word, it may be used as a PL/SQL name.
66036605
| <INLINE> // PRAGMA INLINE is not a PLSQL reserved word
6604-
)
6605-
6606-
{ jjtThis.setImage(token.getImage()) ; jjtThis.value = token ; return jjtThis ; }
66076606
}
66086607

66096608
//SRT 2011-04-17 - END */
@@ -6614,7 +6613,7 @@ ASTID ID(): {}
66146613
{
66156614
(<IDENTIFIER>
66166615
| <QUOTED_LITERAL>
6617-
| KEYWORD_UNRESERVED() //SRT 2011-04-17
6616+
| KEYWORD_NOT_RESERVED() //SRT 2011-04-17
66186617
/*KEYWORDS_UNRESERVED
66196618
|<EXTRACT> | <FALSE> | <TRUE> | <SECOND> | <MINUTE> | <HOUR> | <DAY> | <MONTH> | <YEAR>
66206619
| <NO> |<ROW> | <CURSOR>
@@ -6766,7 +6765,7 @@ ASTID ID(): {}
67666765
//20120427 | <OID>
67676766
//20120428 | <AGGREGATE>
67686767
//| <SYS_REFCURSOR>
6769-
| <JAVA_INTERFACE_CLASS> | <SQLDATA_CLASS> | <CUSTOMDATUM_CLASS> | <ORADATA_CLASS>
6768+
| <JAVA_INTERFACE_CLASS>
67706769
//20120427 | <EXTERNAL>
67716770
//SRT 20090608 ALTER TYPE key words
67726771
//| <ADD>
@@ -6776,7 +6775,7 @@ ASTID ID(): {}
67766775
//20120427 | <MODIFY>
67776776
//SRT 20110524 | <SELF>
67786777
)
6779-
{ jjtThis.setImage(token.getImage()) ; jjtThis.value = token ; return jjtThis ; }
6778+
{ jjtThis.setImage(token.getText().toString()) ; jjtThis.value = token ; return jjtThis ; }
67806779
}
67816780

67826781
/**
@@ -6787,7 +6786,7 @@ ASTUnqualifiedID UnqualifiedID(): {}
67876786
(
67886787
<IDENTIFIER>
67896788
| <QUOTED_LITERAL>
6790-
| KEYWORD_UNRESERVED() //SRT 2011-04-17
6789+
| KEYWORD_NOT_RESERVED() //SRT 2011-04-17
67916790
//20120501 | <INTERVAL>
67926791
| <MOD>
67936792
| <RAW>
@@ -6799,7 +6798,7 @@ ASTUnqualifiedID UnqualifiedID(): {}
67996798
| <LOOP>
68006799
//| <RESULT>
68016800
)
6802-
{ jjtThis.setImage(token.getImage()) ; jjtThis.value = token ; return jjtThis ; }
6801+
{ jjtThis.setImage(token.getText().toString()) ; jjtThis.value = token ; return jjtThis ; }
68036802
}
68046803

68056804
/**
@@ -6810,7 +6809,7 @@ ASTQualifiedID QualifiedID(): {}
68106809
(
68116810
<IDENTIFIER>
68126811
| <QUOTED_LITERAL>
6813-
| KEYWORD_UNRESERVED() //SRT 2011-04-17
6812+
| KEYWORD_NOT_RESERVED() //SRT 2011-04-17
68146813
/*
68156814
| <EXTRACT> --Unreserved Key Word
68166815
| <FALSE> --Unreserved Key Word
@@ -6999,7 +6998,7 @@ ASTQualifiedID QualifiedID(): {}
69996998
//| <CHARACTER>
70006999
//| <LIMIT>
70017000
)
7002-
{ jjtThis.setImage(token.getImage()) ; jjtThis.value = token ; return jjtThis ; }
7001+
{ jjtThis.setImage(token.getText().toString()) ; jjtThis.value = token ; return jjtThis ; }
70037002
}
70047003

70057004
ASTTypeKeyword TypeKeyword(): {}
@@ -7032,15 +7031,15 @@ ASTTypeKeyword TypeKeyword(): {}
70327031
<TIMEZONE_REGION> | <TIMEZONE_ABBR> | <TIMEZONE_MINUTE> | <TIMEZONE_HOUR> | <DOUBLE> | <PRECISION> |
70337032
<VARRAY> |
70347033
<YEAR> | <LOCAL> | <WITH> | <ZONE>
7035-
| <JAVA_INTERFACE_CLASS> | <SQLDATA_CLASS> | <CUSTOMDATUM_CLASS> | <ORADATA_CLASS>
7034+
| <JAVA_INTERFACE_CLASS>
70367035
)
70377036
{ jjtThis.setImage(token.getImage()) ; jjtThis.value = token ; return jjtThis ; }
70387037
}
70397038

70407039
ASTJavaInterfaceClass JavaInterfaceClass(): {}
70417040
{
70427041
(
7043-
<SQLDATA_CLASS> | <CUSTOMDATUM_CLASS> | <ORADATA_CLASS>
7042+
<JAVA_INTERFACE_CLASS>
70447043
)
70457044
{ jjtThis.setImage(token.getImage()) ; jjtThis.value = token ; return jjtThis ; }
70467045
}

0 commit comments

Comments
 (0)