import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.io.PrintWriter; import java.net.URL; import java.net.URLConnection; import java.util.Scanner; import org.apache.commons.lang3.StringEscapeUtils; import edu.stanford.nlp.tagger.maxent.MaxentTagger; public class PageTagger { /** * Takes a String as input and returns the tagged text, another String, as output. * Internally the method should use thetagString method of MaxentTagger to tag the String. * @param s * @return tagged String * @throws ClassNotFoundException * @throws IOException */ public static String tagText (String s) throws ClassNotFoundException, IOException{ MaxentTagger tagger = new MaxentTagger("Taggers/english-left3words-distsim.tagger"); StringBuilder cc=new StringBuilder(); // Cleaning the string by removing more than one white space s=s.replaceAll("\\s+", " ").trim(); /** * Factored parsing of sentences up to 200 words requires around 3GB of memory. * If string input length is more than 200, we break it into substring where * we can find the last word within 200 character limit. Then I apply tagString() method. * After performed I append the String Builder. * I am doing it to avoid memory leak. * * Reference: "To be able to handle longer sentences, you need more (to parse sentences * up to 100 words, you need 400 MB). For running the Factored Parser, 600 MB is needed * for dealing with sentences up to 40 words. Factored parsing of sentences up to "200" * words requires around 3GB of memory." * Website: * http://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/parser/lexparser/package-summary.html * http://nlp.stanford.edu/software/parser-faq.shtml * */ if(s.length()<=200){ String sample = s;//"This is a sample text"; String tagged = tagger.tagString(sample); System.out.println("tag text method ended"); return tagged; }else{ int fixed=200; int firstIndex=0; int lastIndex=0; while((firstIndexs.length())){ break; } if(lastIndex==-1){ lastIndex=firstIndex+fixed; } if(lastIndex"," "); ans=ans.replaceAll("(?is)", " "); ans=StringEscapeUtils.unescapeHtml4(ans); // to html elements such as  , &, etc return ans.replaceAll("\\<.*?>"," "); // remving other remaining tags if any. } public static void main(String[] args) throws ClassNotFoundException, IOException{ System.out.println("Please enter the currect url"); Scanner sc=new Scanner(System.in); String ip=sc.nextLine(); URL url=new URL(ip); //http://gumgum-public.s3.amazonaws.com/numbers.html String st=getText(url); String str=tagText(st); PrintWriter out = new PrintWriter(new FileWriter("Output.txt")); out.println(str); out.close(); } }