1- #!/usr/bin/env python
2-
31import logging
4- import codecs
52import re
63import bz2
74
@@ -14,6 +11,7 @@ class Conllu(BaseReader):
1411 A reader of the Conll-u files.
1512
1613 """
14+
1715 def __init__ (self , args = None ):
1816 if args is None :
1917 args = {}
@@ -55,15 +53,14 @@ def __init__(self, args=None):
5553 # Use bz2 lib when bz2 file is given.
5654 if filename_extension == 'bz2' :
5755 logging .info ('Opening BZ2 file %s' , self .filename )
58- self .file_handler = bz2 .BZ2File (self .filename )
56+ self .file_handler = bz2 .open (
57+ self .filename , 'rt' , encoding = 'utf-8' )
5958 else :
6059 logging .info ('Opening regular file %s' , self .filename )
61- self .file_handler = open (self .filename , 'rb ' )
60+ self .file_handler = open (self .filename , 'rt' , encoding = 'utf-8 ' )
6261 else :
6362 raise ValueError ('No file to process' )
6463
65- self .file_handler = codecs .getreader ('utf8' )(self .file_handler )
66-
6764 # Remember total number of bundles
6865 self .total_number_of_bundles = 0
6966
@@ -92,7 +89,8 @@ def process_document(self, document):
9289 number_of_processed_bundles = - 1
9390 number_of_loaded_bundles = 0
9491
95- # Compile a set of regular expressions that will be searched over the lines.
92+ # Compile a set of regular expressions that will be searched over the
93+ # lines.
9694 re_comment_like = re .compile (r'^#' )
9795 re_sentence_id = re .compile (r'^# sent_id (\S+)' )
9896 re_multiword_tokens = re .compile (r'^\d+-' )
@@ -105,7 +103,8 @@ def process_document(self, document):
105103
106104 # If we can not add next bundle, return document.
107105 if number_of_loaded_bundles >= self .bundles_per_document :
108- logging .debug ('Reached number of requested bundles (%d)' , self .bundles_per_document )
106+ logging .debug (
107+ 'Reached number of requested bundles (%d)' , self .bundles_per_document )
109108 return document
110109
111110 # Obtain a raw bundle.
@@ -127,7 +126,8 @@ def process_document(self, document):
127126 raw_bundle_check = False
128127
129128 if not raw_bundle_check :
130- raise RuntimeError ('Detected an invalid bundle: %r' % raw_bundle )
129+ raise RuntimeError (
130+ 'Detected an invalid bundle: %r' % raw_bundle )
131131
132132 # Initialize the data structures.
133133 root_node = Root ()
@@ -144,7 +144,8 @@ def process_document(self, document):
144144 match = re_sentence_id .search (line )
145145 if match is not None :
146146 sent_id = match .group (1 )
147- logging .debug ('Matched sent_id keyword with value %s' , sent_id )
147+ logging .debug (
148+ 'Matched sent_id keyword with value %s' , sent_id )
148149 root_node .sent_id = sent_id
149150 continue
150151
@@ -159,15 +160,17 @@ def process_document(self, document):
159160 logging .debug ('Skipping multi-word tokens %s' , line )
160161 continue
161162
162- # Otherwise the line is a tab-separated list of node attributes.
163+ # Otherwise the line is a tab-separated list of node
164+ # attributes.
163165 node = root_node .create_child ()
164166 raw_node_attributes = line .split ('\t ' )
165167 for (n_attribute , attribute_name ) in enumerate (self .node_attributes ):
166168 if attribute_name == 'feats' :
167169 attribute_name = 'raw_feats'
168170 if attribute_name == 'deps' :
169171 attribute_name = 'raw_deps'
170- setattr (node , attribute_name , raw_node_attributes [n_attribute ])
172+ setattr (node , attribute_name ,
173+ raw_node_attributes [n_attribute ])
171174
172175 nodes .append (node )
173176
@@ -185,9 +188,11 @@ def process_document(self, document):
185188
186189 # At least one node should be parsed.
187190 if len (nodes ) == 0 :
188- raise ValueError ('Probably two empty lines following each other.' )
191+ raise ValueError (
192+ 'Probably two empty lines following each other.' )
189193
190- # If specified, check sentence ID to match the sentence ID filter.
194+ # If specified, check sentence ID to match the sentence ID
195+ # filter.
191196 if self .sentence_id_filter is not None :
192197 if self .sentence_id_filter .match (root_node .sent_id ) is None :
193198 logging .debug ('Skipping sentence %s as it does not match the sentence ID filter.' ,
0 commit comments