Skip to content

Commit 8045911

Browse files
RomanPenzamurd0
authored andcommitted
File utils (#104)
* Add file_utils module
1 parent 361e646 commit 8045911

2 files changed

Lines changed: 660 additions & 0 deletions

File tree

codonPython/file_utils.py

Lines changed: 300 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,300 @@
1+
import pandas as pd
2+
import os
3+
4+
def file_search(path = '.', doctype = 'csv', like = [''], strict = False):
5+
"""
6+
This function creates a list of all files of a certain type, satisfying the criteria outlined
7+
in like = [...] parameter. The function only searches for files in the specified folder
8+
of the current working directory that is set by the user.
9+
10+
Parameters
11+
-----------
12+
path : string
13+
Path to a folder in the current working directory
14+
default = '.', i.e. current working directory folder
15+
doctype : string
16+
Document format to search for
17+
e.g. 'csv' or 'xlsx'
18+
default = 'csv'
19+
like : list
20+
A list of words to filter the file search on
21+
default = [''], i.e. no filter
22+
strict : bool
23+
Set True to search for filenames containing all words from 'like' list (
24+
default = False
25+
26+
Returns
27+
-------
28+
list
29+
30+
Examples
31+
-------
32+
>>> file_search(doctype = 'md')
33+
['README.md', 'CONTRIBUTING.md']
34+
35+
>>> file_search(doctype = 'md', like = ['READ'])
36+
['README.md']
37+
38+
"""
39+
40+
if not isinstance(path, str):
41+
raise ValueError('Please input path as a string')
42+
elif not isinstance(doctype, str):
43+
raise ValueError('Please input doctype as a string')
44+
elif not isinstance(like, list):
45+
raise ValueError('Please input like as a list')
46+
elif not isinstance(strict, bool):
47+
raise ValueError('Please input strict as a bool')
48+
else:
49+
pass
50+
51+
list_of_files = []
52+
53+
if strict == False:
54+
for file in os.listdir(path):
55+
if (file.split('.')[-1] == doctype) & (any(x in file for x in like)):
56+
list_of_files.append(file)
57+
else:
58+
for file in os.listdir(path):
59+
if (file.split('.')[-1] == doctype) & (all(x in file for x in like)):
60+
list_of_files.append(file)
61+
62+
return list_of_files
63+
64+
65+
66+
def import_files(path = '.', doctype = 'csv', sheet = 'Sheet1', subdir = False, like = [''], strict = False):
67+
"""
68+
This function imports all documents of a given format to a dictionary
69+
and returns this dictionary, keeping original file names.
70+
71+
Parameters
72+
----------
73+
path : string
74+
Path to a folder in the current working directory
75+
default = '.', i.e. current working directory folder
76+
doctype : string
77+
Document format to search for
78+
e.g. 'csv' or 'xlsx'
79+
default = 'csv'
80+
sheet : string
81+
Sheet name of the xlsx file
82+
default = 'Sheet1'
83+
subdir : bool
84+
True to allow download all files, including the subdirectories
85+
default = False
86+
like : list
87+
A list of words to filter the file search on
88+
default = [''], i.e. no filter
89+
strict : bool
90+
Set True to search for filenames containing all words from 'like' list
91+
default = False
92+
93+
Returns
94+
-------
95+
out : dict
96+
97+
Examples
98+
--------
99+
100+
'>>> import_files()'
101+
102+
File Data_AprF_2019 is successfully imported
103+
104+
File Data_AugF_2019 is successfully imported
105+
106+
File Data_JulF_2019 is successfully imported
107+
108+
File Data_JunF_2019_v1 is successfully imported
109+
110+
File Data_MayF_2019 is successfully imported
111+
112+
File Data_SepP_2019 is successfully imported
113+
114+
'>>> import_files(like = ['Aug','Sep'])'
115+
116+
File Data_AugF_2019 is successfully imported
117+
118+
File Data_SepP_2019 is successfully imported
119+
120+
121+
"""
122+
123+
if not isinstance(path, str):
124+
raise ValueError('Please input path as a string')
125+
elif not isinstance(doctype, str):
126+
raise ValueError('Please input doctype as a string')
127+
elif not isinstance(sheet, str):
128+
raise ValueError('Please input sheet as a string')
129+
elif not isinstance(subdir, bool):
130+
raise ValueError('Please input subdir as a bool')
131+
elif not isinstance(like, list):
132+
raise ValueError('Please input like as a list')
133+
elif not isinstance(strict, bool):
134+
raise ValueError('Please input strict as a bool')
135+
else:
136+
pass
137+
138+
139+
dict_files = {}
140+
if subdir == True:
141+
142+
for r, d, f in os.walk(path):
143+
for file in f:
144+
b = any(x in file for x in like)
145+
if strict == True:
146+
b = all(x in file for x in like)
147+
if (file.split('.')[-1] == doctype) & (b == True):
148+
k = file.strip('.' + doctype)
149+
try:
150+
name = os.path.join(r,file)
151+
print('\nImporting ' + k + '...', end = "", flush = True)
152+
if doctype == 'csv':
153+
dict_files[name.strip('.\\').strip('.csv')] = pd.read_csv(name)
154+
print('\rFile ' + k + ' is successfully imported')
155+
else:
156+
dict_files[name.strip('.\\').strip('.xlsx')] = pd.read_excel(name, sheet_name = sheet)
157+
print('\rFile ' + k + ' is successfully imported')
158+
except:
159+
print('Unable to read ' + k + ' file')
160+
else:
161+
for file in os.listdir(path):
162+
b = any(x in file for x in like)
163+
if strict == True:
164+
b = all(x in file for x in like)
165+
166+
if (file.split('.')[-1] == doctype) & (b == True):
167+
k = file.strip('.' + doctype)
168+
try:
169+
name = os.path.join(path,file)
170+
print('\nImporting ' + k + '...', end = "", flush = True)
171+
if doctype == 'csv':
172+
dict_files[k] = pd.read_csv(name)
173+
print('\rFile ' + k + ' is successfully imported')
174+
else:
175+
dict_files[k] = pd.read_excel(name, sheet_name = sheet)
176+
print('\rFile ' + k + ' is successfully imported')
177+
except:
178+
print('Unable to read ' + k + ' file')
179+
180+
return dict_files
181+
182+
def compare(x, y, names = ['x','y'], dups = False, same = False, comment = False):
183+
"""
184+
This function returns a dictionary with:
185+
186+
1. Same values between data frames x and y
187+
2. Values in x, not in y
188+
3. Values in y, not in x
189+
190+
(optional):
191+
(4) Duplicates of x
192+
(5) Duplicates of y
193+
(6) Boolean of whether x and y are the same
194+
195+
Parameters
196+
----------
197+
x : pandas.DataFrame
198+
DataFrame #1
199+
y : pandas.DataFrame
200+
DataFrame #2
201+
names : list
202+
a list of user preferred file names
203+
e.g. ['File1', 'File2']
204+
default = ['x','y']
205+
dups : bool
206+
True to include duplicates check for each file
207+
default = False
208+
same : bool
209+
True to activate. Outputs True if DataFrames are the same
210+
default = False
211+
comment : bool
212+
True to activate. Prints out statistics of the compariosn results
213+
e.g. number of same valeus, number of duplicates, number of outliers and whether the DataFrames are the same
214+
default = False
215+
216+
Returns
217+
-------
218+
out : dict
219+
220+
Examples
221+
--------
222+
223+
'>>> c = compare(df1, df2, names = ['df1','df2'], dups = True, same = True, comment =True)'
224+
225+
There are 133891 same values
226+
There are 16531 outliers in df1
227+
There are 20937 outliers in df2
228+
There are 48704 duplicates in df1
229+
There are 0 duplicates in df2
230+
The DataFrames are not the same
231+
232+
'>>> c = compare(df2, df2, names = ['df2','df2'], dups = True, same = True, comment =True)'
233+
234+
There are 154444 same values
235+
There are 0 outliers in df2
236+
There are 0 outliers in df2
237+
There are 0 duplicates in df2
238+
There are 0 duplicates in df2
239+
The DataFrames are the same
240+
"""
241+
242+
if not isinstance(x, pd.DataFrame):
243+
raise ValueError('Please input x as a pandas.DataFrame')
244+
elif not isinstance(y, pd.DataFrame):
245+
raise ValueError('Please input y as a pandas.DataFrame')
246+
elif not isinstance(names, list):
247+
raise ValueError('Please input names as a list')
248+
elif not isinstance(dups, bool):
249+
raise ValueError('Please input dups as a bool')
250+
elif not isinstance(same, bool):
251+
raise ValueError('Please input same as a bool')
252+
elif not isinstance(comment, bool):
253+
raise ValueError('Please input comment as a bool')
254+
else:
255+
pass
256+
257+
dict_temp = {}
258+
259+
try:
260+
dict_temp['same_values'] = pd.merge(x.drop_duplicates(),y.drop_duplicates(), how = 'inner')
261+
except:
262+
print('Unable to identify same values')
263+
try:
264+
dict_temp[names[0] + '_not_' + names[1]] = pd.concat([x,dict_temp['same_values']], ignore_index = True).drop_duplicates(keep = False)
265+
dict_temp[names[1] + '_not_' + names[0]] = pd.concat([y,dict_temp['same_values']], ignore_index = True).drop_duplicates(keep = False)
266+
except:
267+
print('Unable to find outliers')
268+
269+
if dups == True:
270+
try:
271+
dict_temp[names[0] + '_dups'] = x[x.duplicated() == True]
272+
dict_temp[names[1] + '_dups'] = y[y.duplicated() == True]
273+
except:
274+
print('Unable to find duplicates')
275+
if same == True:
276+
try:
277+
if (x.shape == y.shape) & (x.shape == dict_temp['same_values'].shape):
278+
dict_temp['Same'] = True
279+
else:
280+
dict_temp['Same'] = False
281+
except:
282+
print('Unable to determine whether the Dataframes are the same')
283+
try:
284+
if comment == True:
285+
print('\nThere are ' + str(dict_temp['same_values'].shape[0]) + ' same values')
286+
print('There are ' + str(dict_temp[names[0] + '_not_' + names[1]].shape[0]) + ' outliers in ' + str(names[0]))
287+
print('There are ' + str(dict_temp[names[1] + '_not_' + names[0]].shape[0]) + ' outliers in ' + str(names[1]))
288+
if dups == True:
289+
print('There are ' + str(dict_temp[names[0] + '_dups'].shape[0]) + ' duplicates in ' + names[0])
290+
print('There are ' + str(dict_temp[names[1] + '_dups'].shape[0]) + ' duplicates in ' + names[1])
291+
if same == True:
292+
if dict_temp['Same'] == True:
293+
s = 'the same'
294+
else:
295+
s = 'not the same'
296+
print('DataFrames are ' + s)
297+
except:
298+
print('Unable to print commentary')
299+
300+
return dict_temp

0 commit comments

Comments
 (0)