Skip to content

Commit ddf531f

Browse files
authored
BAEL-5194 rm html tags (#11404)
1 parent 7f97bc1 commit ddf531f

4 files changed

Lines changed: 117 additions & 0 deletions

File tree

xml/pom.xml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,22 @@
1414
</parent>
1515

1616
<dependencies>
17+
<dependency>
18+
<groupId>org.jsoup</groupId>
19+
<artifactId>jsoup</artifactId>
20+
<version>${jsoup.version}</version>
21+
</dependency>
22+
<dependency>
23+
<groupId>net.sourceforge.htmlcleaner</groupId>
24+
<artifactId>htmlcleaner</artifactId>
25+
<version>${htmlcleaner.version}</version>
26+
</dependency>
27+
<dependency>
28+
<groupId>net.htmlparser.jericho</groupId>
29+
<artifactId>jericho-html</artifactId>
30+
<version>${jericho.version}</version>
31+
</dependency>
32+
1733
<!-- xml libraries -->
1834
<dependency>
1935
<groupId>org.dom4j</groupId>
@@ -361,6 +377,9 @@
361377
<!-- maven plugins -->
362378
<maven-jibx-plugin.version>1.3.1</maven-jibx-plugin.version>
363379
<maven-compiler-plugin.version>3.8.0</maven-compiler-plugin.version>
380+
<jsoup.version>1.14.3</jsoup.version>
381+
<htmlcleaner.version>2.25</htmlcleaner.version>
382+
<jericho.version>3.4</jericho.version>
364383
</properties>
365384

366385
</project>
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
package com.baeldung.xmlhtml.delhtmltags;
2+
3+
import net.htmlparser.jericho.Renderer;
4+
import net.htmlparser.jericho.Segment;
5+
import net.htmlparser.jericho.Source;
6+
import org.htmlcleaner.CleanerProperties;
7+
import org.htmlcleaner.HtmlCleaner;
8+
import org.jsoup.Jsoup;
9+
import org.junit.jupiter.api.Test;
10+
11+
import java.io.IOException;
12+
import java.net.URISyntaxException;
13+
import java.nio.file.Files;
14+
import java.nio.file.Paths;
15+
16+
class RemoveHtmlTagsLiveTest {
17+
18+
@Test
19+
void givenHtml1_whenRemoveTagsByRegex_thenPrintText() throws IOException, URISyntaxException {
20+
String html = new String(Files.readAllBytes(
21+
(Paths.get(getClass().getResource("/xmlhtml/delhtmltags/example1.html").toURI()))));
22+
String result = html.replaceAll("<[^>]*>", "")
23+
.replaceAll("(?m)^\\s*$", ""); // remove empty and blank lines
24+
System.out.println(result);
25+
}
26+
27+
@Test
28+
void givenHtml2_whenRemoveTagsByRegex_thenPrintText() throws IOException, URISyntaxException {
29+
String html = new String(Files.readAllBytes(
30+
(Paths.get(getClass().getResource("/xmlhtml/delhtmltags/example2.html").toURI()))));
31+
String result = html.replaceAll("<[^>]*>", "");
32+
System.out.println(result);
33+
}
34+
35+
@Test
36+
void givenHtml2_whenRemoveTagsByJsoup_thenPrintText() throws IOException, URISyntaxException {
37+
String html = new String(Files.readAllBytes(
38+
(Paths.get(getClass().getResource("/xmlhtml/delhtmltags/example2.html").toURI()))));
39+
System.out.println(Jsoup.parse(html).text());
40+
}
41+
42+
@Test
43+
void givenHtml2_whenRemoveTagsByHtmlCleaner_thenPrintText() throws IOException, URISyntaxException {
44+
String html = new String(Files.readAllBytes(
45+
(Paths.get(getClass().getResource("/xmlhtml/delhtmltags/example2.html").toURI()))));
46+
CleanerProperties props = new CleanerProperties();
47+
props.setPruneTags("script");
48+
String result = new HtmlCleaner(props).clean(html).getText().toString();
49+
System.out.println(result);
50+
}
51+
52+
@Test
53+
void givenHtml2_whenRemoveTagsByJericho_thenPrintText() throws IOException, URISyntaxException {
54+
String html = new String(Files.readAllBytes(
55+
(Paths.get(getClass().getResource("/xmlhtml/delhtmltags/example2.html").toURI()))));
56+
Source htmlSource = new Source(html);
57+
Segment segment = new Segment(htmlSource, 0, htmlSource.length());
58+
Renderer htmlRender = new Renderer(segment).setIncludeHyperlinkURLs(true);
59+
System.out.println(htmlRender);
60+
}
61+
}
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
2+
"http://www.w3.org/TR/html4/loose.dtd">
3+
<html>
4+
<head>
5+
<title>This is the page title</title>
6+
</head>
7+
<body>
8+
<p>
9+
If the application X doesn't start, the possible causes could be:<br/>
10+
1. <a href="maven.com">Maven</a> is not installed.<br/>
11+
2. Not enough disk space.<br/>
12+
3. Not enough memory.
13+
</p>
14+
</body>
15+
</html>
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
2+
"http://www.w3.org/TR/html4/loose.dtd">
3+
<html>
4+
<head>
5+
<title>This is the page title</title>
6+
</head>
7+
<script>
8+
// some interesting script functions
9+
</script>
10+
<body>
11+
<p>
12+
If the application X doesn't start, the possible causes could be:<br/>
13+
1. <a
14+
id="link"
15+
href="http://maven.apache.org/">
16+
Maven
17+
</a> is not installed.<br/>
18+
2. Not enough (<1G) disk space.<br/>
19+
3. Not enough (<64MB) memory.<br/>
20+
</p>
21+
</body>
22+
</html>

0 commit comments

Comments
 (0)