Skip to content

Commit a9a55bb

Browse files
authored
Merge pull request eugenp#6974 from Doha2012/master
remove stopwords from string
2 parents d43790d + a4762fc commit a9a55bb

File tree

5 files changed

+5182
-55
lines changed

5 files changed

+5182
-55
lines changed

java-strings-2/pom.xml

Lines changed: 0 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -15,28 +15,6 @@
1515
</parent>
1616

1717
<dependencies>
18-
<dependency>
19-
<groupId>commons-io</groupId>
20-
<artifactId>commons-io</artifactId>
21-
<version>${commons-io.version}</version>
22-
</dependency>
23-
<dependency>
24-
<groupId>log4j</groupId>
25-
<artifactId>log4j</artifactId>
26-
<version>${log4j.version}</version>
27-
</dependency>
28-
<dependency>
29-
<groupId>commons-codec</groupId>
30-
<artifactId>commons-codec</artifactId>
31-
<version>${commons-codec.version}</version>
32-
</dependency>
33-
<!-- test scoped -->
34-
<dependency>
35-
<groupId>org.assertj</groupId>
36-
<artifactId>assertj-core</artifactId>
37-
<version>${assertj.version}</version>
38-
<scope>test</scope>
39-
</dependency>
4018
<dependency>
4119
<groupId>org.openjdk.jmh</groupId>
4220
<artifactId>jmh-core</artifactId>
@@ -57,11 +35,6 @@
5735
<artifactId>guava</artifactId>
5836
<version>${guava.version}</version>
5937
</dependency>
60-
<dependency>
61-
<groupId>com.vdurmont</groupId>
62-
<artifactId>emoji-java</artifactId>
63-
<version>${emoji-java.version}</version>
64-
</dependency>
6538
<dependency>
6639
<groupId>org.apache.commons</groupId>
6740
<artifactId>commons-lang3</artifactId>
@@ -73,38 +46,18 @@
7346
<version>${junit.version}</version>
7447
<scope>test</scope>
7548
</dependency>
76-
<dependency>
77-
<groupId>org.junit.jupiter</groupId>
78-
<artifactId>junit-jupiter-api</artifactId>
79-
<version>${junit-jupiter-api.version}</version>
80-
<scope>test</scope>
81-
</dependency>
82-
8349
<dependency>
8450
<groupId>org.hamcrest</groupId>
8551
<artifactId>hamcrest-library</artifactId>
8652
<version>${org.hamcrest.version}</version>
8753
<scope>test</scope>
8854
</dependency>
89-
90-
<!-- Added for password generation -->
91-
<dependency>
92-
<groupId>org.passay</groupId>
93-
<artifactId>passay</artifactId>
94-
<version>${passay.version}</version>
95-
</dependency>
9655
<dependency>
9756
<groupId>org.apache.commons</groupId>
9857
<artifactId>commons-text</artifactId>
9958
<version>${commons-text.version}</version>
10059
</dependency>
10160

102-
<dependency>
103-
<groupId>org.ahocorasick</groupId>
104-
<artifactId>ahocorasick</artifactId>
105-
<version>${ahocorasick.version}</version>
106-
</dependency>
107-
10861
</dependencies>
10962

11063
<build>
@@ -131,18 +84,10 @@
13184
</build>
13285

13386
<properties>
134-
<!-- util -->
13587
<commons-lang3.version>3.8.1</commons-lang3.version>
136-
<commons-codec.version>1.10</commons-codec.version>
137-
<!-- testing -->
138-
<assertj.version>3.6.1</assertj.version>
13988
<icu4j.version>61.1</icu4j.version>
14089
<guava.version>27.0.1-jre</guava.version>
141-
<emoji-java.version>4.0.0</emoji-java.version>
142-
<junit-jupiter-api.version>5.3.1</junit-jupiter-api.version>
143-
<passay.version>1.3.1</passay.version>
14490
<commons-text.version>1.4</commons-text.version>
145-
<ahocorasick.version>0.4.0</ahocorasick.version>
14691
</properties>
14792

14893
</project>
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
package com.baeldung.string.performance;
2+
3+
import java.io.IOException;
4+
import java.nio.file.Files;
5+
import java.nio.file.Paths;
6+
import java.util.ArrayList;
7+
import java.util.List;
8+
import java.util.concurrent.TimeUnit;
9+
import java.util.stream.Collectors;
10+
import java.util.stream.Stream;
11+
12+
import org.openjdk.jmh.annotations.Benchmark;
13+
import org.openjdk.jmh.annotations.BenchmarkMode;
14+
import org.openjdk.jmh.annotations.Fork;
15+
import org.openjdk.jmh.annotations.Mode;
16+
import org.openjdk.jmh.annotations.OutputTimeUnit;
17+
import org.openjdk.jmh.annotations.Scope;
18+
import org.openjdk.jmh.annotations.Setup;
19+
import org.openjdk.jmh.annotations.State;
20+
21+
22+
@Fork(value = 3, warmups = 1)
23+
@State(Scope.Benchmark)
24+
@BenchmarkMode(Mode.AverageTime)
25+
@OutputTimeUnit(TimeUnit.MILLISECONDS)
26+
public class RemovingStopwordsPerformanceComparison {
27+
28+
private String data;
29+
30+
private List<String> stopwords;
31+
32+
private String stopwordsRegex;
33+
34+
35+
public static void main(String[] args) throws Exception {
36+
org.openjdk.jmh.Main.main(args);
37+
}
38+
39+
@Setup
40+
public void setup() throws IOException {
41+
data = new String(Files.readAllBytes(Paths.get("src/main/resources/shakespeare-hamlet.txt")));
42+
data = data.toLowerCase();
43+
stopwords = Files.readAllLines(Paths.get("src/main/resources/english_stopwords.txt"));
44+
stopwordsRegex = stopwords.stream().collect(Collectors.joining("|", "\\b(", ")\\b\\s?"));
45+
}
46+
47+
@Benchmark
48+
public String removeManually() {
49+
String[] allWords = data.split(" ");
50+
StringBuilder builder = new StringBuilder();
51+
for(String word:allWords) {
52+
if(! stopwords.contains(word)) {
53+
builder.append(word);
54+
builder.append(' ');
55+
}
56+
}
57+
return builder.toString().trim();
58+
}
59+
60+
@Benchmark
61+
public String removeAll() {
62+
ArrayList<String> allWords = Stream.of(data.split(" "))
63+
.collect(Collectors.toCollection(ArrayList<String>::new));
64+
allWords.removeAll(stopwords);
65+
return allWords.stream().collect(Collectors.joining(" "));
66+
}
67+
68+
@Benchmark
69+
public String replaceRegex() {
70+
return data.replaceAll(stopwordsRegex, "");
71+
}
72+
73+
}
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
i
2+
me
3+
my
4+
myself
5+
we
6+
our
7+
ours
8+
ourselves
9+
you
10+
your
11+
yours
12+
yourself
13+
yourselves
14+
he
15+
him
16+
his
17+
himself
18+
she
19+
her
20+
hers
21+
herself
22+
it
23+
its
24+
itself
25+
they
26+
them
27+
their
28+
theirs
29+
themselves
30+
what
31+
which
32+
who
33+
whom
34+
this
35+
that
36+
these
37+
those
38+
am
39+
is
40+
are
41+
was
42+
were
43+
be
44+
been
45+
being
46+
have
47+
has
48+
had
49+
having
50+
do
51+
does
52+
did
53+
doing
54+
a
55+
an
56+
the
57+
and
58+
but
59+
if
60+
or
61+
because
62+
as
63+
until
64+
while
65+
of
66+
at
67+
by
68+
for
69+
with
70+
about
71+
against
72+
between
73+
into
74+
through
75+
during
76+
before
77+
after
78+
above
79+
below
80+
to
81+
from
82+
up
83+
down
84+
in
85+
out
86+
on
87+
off
88+
over
89+
under
90+
again
91+
further
92+
then
93+
once
94+
here
95+
there
96+
when
97+
where
98+
why
99+
how
100+
all
101+
any
102+
both
103+
each
104+
few
105+
more
106+
most
107+
other
108+
some
109+
such
110+
no
111+
nor
112+
not
113+
only
114+
own
115+
same
116+
so
117+
than
118+
too
119+
very
120+
s
121+
t
122+
can
123+
will
124+
just
125+
don
126+
should
127+
now

0 commit comments

Comments
 (0)