|
| 1 | +package edu.illinois.cs.cogcomp.tutorial; |
| 2 | + |
| 3 | +import edu.illinois.cs.cogcomp.core.io.LineIO; |
| 4 | +import edu.illinois.cs.cogcomp.lbjava.parse.Parser; |
| 5 | +import edu.illinois.cs.cogcomp.tutorial.datastruct.ConllRawSentence; |
| 6 | +import edu.illinois.cs.cogcomp.tutorial.datastruct.ConllRawToken; |
| 7 | +import edu.illinois.cs.cogcomp.tutorial.datastruct.ConllRelation; |
| 8 | + |
| 9 | +import java.io.FileNotFoundException; |
| 10 | +import java.util.ArrayList; |
| 11 | +import java.util.List; |
| 12 | +import java.util.Vector; |
| 13 | + |
| 14 | +//import parse.Parser; |
| 15 | +//import jointentityrelationextraction. |
| 16 | +//import edu.illinois.cs.cogcomp.datastruct; |
| 17 | +//import edu.illinois.cs.cogcomp.datastruct.ConllRawSentence; |
| 18 | +//import edu.illinois.cs.cogcomp.datastruct.ConllRawToken; |
| 19 | +//import edu.illinois.cs.cogcomp.datastruct.ConllRelation; |
| 20 | + |
| 21 | +public class Conll04_InstanceReader implements Parser { |
| 22 | + public Vector<ConllRawToken> instances; |
| 23 | + public Vector<ConllRawSentence> sentences; |
| 24 | + public Vector<ConllRelation> relations; |
| 25 | + |
| 26 | + public String[] entityLabels,relLabels; |
| 27 | + private int currentInstanceId; |
| 28 | + |
| 29 | + |
| 30 | + public Conll04_InstanceReader(String filename){ |
| 31 | + instances=new Vector<ConllRawToken>(); |
| 32 | + relations=new Vector<ConllRelation>(); |
| 33 | + sentences=new Vector<ConllRawSentence>(); |
| 34 | + entityLabels=new String[0]; |
| 35 | + relLabels=new String[0]; |
| 36 | +// } |
| 37 | + |
| 38 | + |
| 39 | + //public void readData(String filename) throws Exception { |
| 40 | + //BufferedReader br=new BufferedReader(new FileReader(filename)); |
| 41 | + List<String> lines = null; |
| 42 | + try { |
| 43 | + lines = LineIO.read(filename); |
| 44 | + } catch (FileNotFoundException e) { |
| 45 | + // TODO Auto-generated catch block |
| 46 | + e.printStackTrace(); |
| 47 | + } |
| 48 | + String line; |
| 49 | + String[] tokens; |
| 50 | + |
| 51 | + |
| 52 | + ConllRawToken c=new ConllRawToken(); |
| 53 | + |
| 54 | + ConllRelation r; |
| 55 | + int currSentId=0; |
| 56 | + boolean sentEnd=false; |
| 57 | + ConllRawSentence sent=new ConllRawSentence(currSentId); |
| 58 | + |
| 59 | + ArrayList<String> entityal = new ArrayList<String>(); |
| 60 | + ArrayList<String> relal = new ArrayList<String>(); |
| 61 | + |
| 62 | + boolean relationSeen=false; |
| 63 | + int sentindex = 0; |
| 64 | + while(sentindex < lines.size()){ |
| 65 | + line = lines.get(sentindex); |
| 66 | + sentindex ++; |
| 67 | + |
| 68 | + //System.out.println(sentindex + " " + line); |
| 69 | + if(line.isEmpty()){ |
| 70 | + sentEnd=true; |
| 71 | + |
| 72 | +/* if(!sentEnd){ |
| 73 | + currSentId++; |
| 74 | + sentEnd=true; |
| 75 | + |
| 76 | + sentences.add(sent); |
| 77 | + |
| 78 | + sent=new ConllRawSentence(currSentId); |
| 79 | + }*/ |
| 80 | + continue; |
| 81 | + } |
| 82 | + |
| 83 | + tokens=line.split("\t|\n"); |
| 84 | + int s=tokens.length; |
| 85 | + if(s==3){ |
| 86 | + relationSeen=true; |
| 87 | + r=new ConllRelation(); |
| 88 | +// r.sentId1=currSentId-1; |
| 89 | +// r.sentId2=currSentId-1; |
| 90 | + r.wordId1=Integer.parseInt(tokens[0]); |
| 91 | + r.wordId2=Integer.parseInt(tokens[1]); |
| 92 | + r.relType=tokens[2]; |
| 93 | + relations.add(r); |
| 94 | + sent.addRelations(r); |
| 95 | +// sentences.elementAt(sentences.size()-1).addRelations(r); |
| 96 | + if(!relal.contains(tokens[2])){ |
| 97 | + relal.add(tokens[2]); |
| 98 | + } |
| 99 | + } |
| 100 | + else{ |
| 101 | + //System.out.println("tokens[1]="+tokens[1]+"done"); |
| 102 | + if(sentEnd){ |
| 103 | + //if(!relationSeen) |
| 104 | + { |
| 105 | + sentences.add(sent); |
| 106 | +/* if(currSentId < 700) |
| 107 | + System.out.println("sid:" + currSentId); |
| 108 | + else System.out.println("sid:" + (currSentId + 51)); |
| 109 | + for(int ind = 0;ind < sent.sentTokens.size();ind ++) |
| 110 | + System.out.print(sent.sentTokens.get(ind).phrase + " "); |
| 111 | + System.out.println(); |
| 112 | + */ |
| 113 | + currSentId++; |
| 114 | + } |
| 115 | + sent=new ConllRawSentence(currSentId); |
| 116 | + } |
| 117 | + |
| 118 | + c=new ConllRawToken(); |
| 119 | + |
| 120 | +/* if(currSentId < 700) |
| 121 | + assert (currSentId == Integer.parseInt(tokens[0])); |
| 122 | + else |
| 123 | + { |
| 124 | + assert(currSentId == Integer.parseInt(tokens[0]) - 51); |
| 125 | + if(currSentId != Integer.parseInt(tokens[0]) - 51) |
| 126 | + System.out.println("fuck you here"); |
| 127 | + }*/ |
| 128 | + |
| 129 | + c.entType=tokens[1]; |
| 130 | + c.sentId=currSentId; |
| 131 | + c.wordId=Integer.parseInt(tokens[2]); |
| 132 | + c.setPOS(tokens[4]); |
| 133 | + c.setPhrase(tokens[5]); |
| 134 | + |
| 135 | + sent.addTokens(c); |
| 136 | + if(!tokens[1].trim().equals("O")){ |
| 137 | + instances.add(c); |
| 138 | + sent.setCurrentTokenAsEntity(); |
| 139 | + if(!entityal.contains(tokens[1])){ |
| 140 | + entityal.add(tokens[1]); |
| 141 | + } |
| 142 | + } |
| 143 | + |
| 144 | + sentEnd=false; |
| 145 | + relationSeen=false; |
| 146 | + } |
| 147 | + } |
| 148 | + |
| 149 | + entityLabels=entityal.toArray(entityLabels); |
| 150 | + relLabels=relal.toArray(relLabels); |
| 151 | + |
| 152 | + } |
| 153 | + |
| 154 | + |
| 155 | + public void printData(){ |
| 156 | + System.out.println("printing total "+sentences.size()+" sentences"); |
| 157 | + for(int i=0;i<sentences.size();i++){ |
| 158 | +// sentences.elementAt(i).printSentence(); |
| 159 | + sentences.elementAt(i).printEntities(); |
| 160 | + sentences.elementAt(i).printRelations(); |
| 161 | + } |
| 162 | + System.out.println("printing total "+instances.size()+" instances"); |
| 163 | + for(int i=0;i<instances.size();i++){ |
| 164 | + instances.elementAt(i).printInstance(); |
| 165 | + } |
| 166 | + System.out.println("printing total "+relations.size()+ " relations"); |
| 167 | + for(int i=0;i<relations.size();i++){ |
| 168 | + relations.elementAt(i).printRelation(); |
| 169 | + } |
| 170 | + } |
| 171 | +// public static void main(String[] args) throws Exception{ |
| 172 | +// System.out.println("here"); |
| 173 | +// Conll04_InstanceReader cr=new Conll04_InstanceReader("./data/conll04.corp"); |
| 174 | +// //cr.readData("./data/conll04.corp"); |
| 175 | +// cr.printData(); |
| 176 | +// } |
| 177 | + public void close() { |
| 178 | + } |
| 179 | + public Object next() { |
| 180 | + |
| 181 | + if (currentInstanceId < instances.size()) { |
| 182 | + |
| 183 | + ConllRawToken file = instances.get(currentInstanceId++); |
| 184 | + |
| 185 | +// String[] split = file.getPath().split(File.separator); |
| 186 | + |
| 187 | + // String label = split[split.length - 2]; |
| 188 | + |
| 189 | + return file;//Document(file, label); |
| 190 | + } else |
| 191 | + return null; |
| 192 | + } |
| 193 | + |
| 194 | + public void reset() { |
| 195 | + currentInstanceId = 0; |
| 196 | + } |
| 197 | + |
| 198 | + public static void main(String[] args) throws Exception{ |
| 199 | + System.out.println("here"); |
| 200 | + Conll04_InstanceReader cr=new Conll04_InstanceReader("/Users/parisakordjamshidi/wolfe-0.1.0/LBJ/data/conll04.corp"); |
| 201 | + |
| 202 | + //cr.readData("/home/roth/rsamdan2/Project/EMStructuredPrediction/UnsupRelationExtraction/data/conll04.corp"); |
| 203 | + cr.printData(); |
| 204 | + |
| 205 | + } |
| 206 | + |
| 207 | +} |
0 commit comments