|
| 1 | +import pandas as pd |
| 2 | +import os |
| 3 | + |
| 4 | +def file_search(path = '.', doctype = 'csv', like = [''], strict = False): |
| 5 | + """ |
| 6 | + This function creates a list of all files of a certain type, satisfying the criteria outlined |
| 7 | + in like = [...] parameter. The function only searches for files in the specified folder |
| 8 | + of the current working directory that is set by the user. |
| 9 | + |
| 10 | + Parameters |
| 11 | + ----------- |
| 12 | + path : string |
| 13 | + Path to a folder in the current working directory |
| 14 | + default = '.', i.e. current working directory folder |
| 15 | + doctype : string |
| 16 | + Document format to search for |
| 17 | + e.g. 'csv' or 'xlsx' |
| 18 | + default = 'csv' |
| 19 | + like : list |
| 20 | + A list of words to filter the file search on |
| 21 | + default = [''], i.e. no filter |
| 22 | + strict : bool |
| 23 | + Set True to search for filenames containing all words from 'like' list ( |
| 24 | + default = False |
| 25 | + |
| 26 | + Returns |
| 27 | + ------- |
| 28 | + list |
| 29 | + |
| 30 | + Examples |
| 31 | + ------- |
| 32 | + >>> file_search(doctype = 'md') |
| 33 | + ['README.md', 'CONTRIBUTING.md'] |
| 34 | + |
| 35 | + >>> file_search(doctype = 'md', like = ['READ']) |
| 36 | + ['README.md'] |
| 37 | + |
| 38 | + """ |
| 39 | + |
| 40 | + if not isinstance(path, str): |
| 41 | + raise ValueError('Please input path as a string') |
| 42 | + elif not isinstance(doctype, str): |
| 43 | + raise ValueError('Please input doctype as a string') |
| 44 | + elif not isinstance(like, list): |
| 45 | + raise ValueError('Please input like as a list') |
| 46 | + elif not isinstance(strict, bool): |
| 47 | + raise ValueError('Please input strict as a bool') |
| 48 | + else: |
| 49 | + pass |
| 50 | + |
| 51 | + list_of_files = [] |
| 52 | + |
| 53 | + if strict == False: |
| 54 | + for file in os.listdir(path): |
| 55 | + if (file.split('.')[-1] == doctype) & (any(x in file for x in like)): |
| 56 | + list_of_files.append(file) |
| 57 | + else: |
| 58 | + for file in os.listdir(path): |
| 59 | + if (file.split('.')[-1] == doctype) & (all(x in file for x in like)): |
| 60 | + list_of_files.append(file) |
| 61 | + |
| 62 | + return list_of_files |
| 63 | + |
| 64 | + |
| 65 | + |
| 66 | +def import_files(path = '.', doctype = 'csv', sheet = 'Sheet1', subdir = False, like = [''], strict = False): |
| 67 | + """ |
| 68 | + This function imports all documents of a given format to a dictionary |
| 69 | + and returns this dictionary, keeping original file names. |
| 70 | +
|
| 71 | + Parameters |
| 72 | + ---------- |
| 73 | + path : string |
| 74 | + Path to a folder in the current working directory |
| 75 | + default = '.', i.e. current working directory folder |
| 76 | + doctype : string |
| 77 | + Document format to search for |
| 78 | + e.g. 'csv' or 'xlsx' |
| 79 | + default = 'csv' |
| 80 | + sheet : string |
| 81 | + Sheet name of the xlsx file |
| 82 | + default = 'Sheet1' |
| 83 | + subdir : bool |
| 84 | + True to allow download all files, including the subdirectories |
| 85 | + default = False |
| 86 | + like : list |
| 87 | + A list of words to filter the file search on |
| 88 | + default = [''], i.e. no filter |
| 89 | + strict : bool |
| 90 | + Set True to search for filenames containing all words from 'like' list |
| 91 | + default = False |
| 92 | +
|
| 93 | + Returns |
| 94 | + ------- |
| 95 | + out : dict |
| 96 | + |
| 97 | + Examples |
| 98 | + -------- |
| 99 | + |
| 100 | + '>>> import_files()' |
| 101 | +
|
| 102 | + File Data_AprF_2019 is successfully imported |
| 103 | + |
| 104 | + File Data_AugF_2019 is successfully imported |
| 105 | + |
| 106 | + File Data_JulF_2019 is successfully imported |
| 107 | + |
| 108 | + File Data_JunF_2019_v1 is successfully imported |
| 109 | + |
| 110 | + File Data_MayF_2019 is successfully imported |
| 111 | + |
| 112 | + File Data_SepP_2019 is successfully imported |
| 113 | + |
| 114 | + '>>> import_files(like = ['Aug','Sep'])' |
| 115 | +
|
| 116 | + File Data_AugF_2019 is successfully imported |
| 117 | + |
| 118 | + File Data_SepP_2019 is successfully imported |
| 119 | +
|
| 120 | + |
| 121 | + """ |
| 122 | + |
| 123 | + if not isinstance(path, str): |
| 124 | + raise ValueError('Please input path as a string') |
| 125 | + elif not isinstance(doctype, str): |
| 126 | + raise ValueError('Please input doctype as a string') |
| 127 | + elif not isinstance(sheet, str): |
| 128 | + raise ValueError('Please input sheet as a string') |
| 129 | + elif not isinstance(subdir, bool): |
| 130 | + raise ValueError('Please input subdir as a bool') |
| 131 | + elif not isinstance(like, list): |
| 132 | + raise ValueError('Please input like as a list') |
| 133 | + elif not isinstance(strict, bool): |
| 134 | + raise ValueError('Please input strict as a bool') |
| 135 | + else: |
| 136 | + pass |
| 137 | + |
| 138 | + |
| 139 | + dict_files = {} |
| 140 | + if subdir == True: |
| 141 | + |
| 142 | + for r, d, f in os.walk(path): |
| 143 | + for file in f: |
| 144 | + b = any(x in file for x in like) |
| 145 | + if strict == True: |
| 146 | + b = all(x in file for x in like) |
| 147 | + if (file.split('.')[-1] == doctype) & (b == True): |
| 148 | + k = file.strip('.' + doctype) |
| 149 | + try: |
| 150 | + name = os.path.join(r,file) |
| 151 | + print('\nImporting ' + k + '...', end = "", flush = True) |
| 152 | + if doctype == 'csv': |
| 153 | + dict_files[name.strip('.\\').strip('.csv')] = pd.read_csv(name) |
| 154 | + print('\rFile ' + k + ' is successfully imported') |
| 155 | + else: |
| 156 | + dict_files[name.strip('.\\').strip('.xlsx')] = pd.read_excel(name, sheet_name = sheet) |
| 157 | + print('\rFile ' + k + ' is successfully imported') |
| 158 | + except: |
| 159 | + print('Unable to read ' + k + ' file') |
| 160 | + else: |
| 161 | + for file in os.listdir(path): |
| 162 | + b = any(x in file for x in like) |
| 163 | + if strict == True: |
| 164 | + b = all(x in file for x in like) |
| 165 | + |
| 166 | + if (file.split('.')[-1] == doctype) & (b == True): |
| 167 | + k = file.strip('.' + doctype) |
| 168 | + try: |
| 169 | + name = os.path.join(path,file) |
| 170 | + print('\nImporting ' + k + '...', end = "", flush = True) |
| 171 | + if doctype == 'csv': |
| 172 | + dict_files[k] = pd.read_csv(name) |
| 173 | + print('\rFile ' + k + ' is successfully imported') |
| 174 | + else: |
| 175 | + dict_files[k] = pd.read_excel(name, sheet_name = sheet) |
| 176 | + print('\rFile ' + k + ' is successfully imported') |
| 177 | + except: |
| 178 | + print('Unable to read ' + k + ' file') |
| 179 | + |
| 180 | + return dict_files |
| 181 | + |
| 182 | +def compare(x, y, names = ['x','y'], dups = False, same = False, comment = False): |
| 183 | + """ |
| 184 | + This function returns a dictionary with: |
| 185 | + |
| 186 | + 1. Same values between data frames x and y |
| 187 | + 2. Values in x, not in y |
| 188 | + 3. Values in y, not in x |
| 189 | + |
| 190 | + (optional): |
| 191 | + (4) Duplicates of x |
| 192 | + (5) Duplicates of y |
| 193 | + (6) Boolean of whether x and y are the same |
| 194 | + |
| 195 | + Parameters |
| 196 | + ---------- |
| 197 | + x : pandas.DataFrame |
| 198 | + DataFrame #1 |
| 199 | + y : pandas.DataFrame |
| 200 | + DataFrame #2 |
| 201 | + names : list |
| 202 | + a list of user preferred file names |
| 203 | + e.g. ['File1', 'File2'] |
| 204 | + default = ['x','y'] |
| 205 | + dups : bool |
| 206 | + True to include duplicates check for each file |
| 207 | + default = False |
| 208 | + same : bool |
| 209 | + True to activate. Outputs True if DataFrames are the same |
| 210 | + default = False |
| 211 | + comment : bool |
| 212 | + True to activate. Prints out statistics of the compariosn results |
| 213 | + e.g. number of same valeus, number of duplicates, number of outliers and whether the DataFrames are the same |
| 214 | + default = False |
| 215 | + |
| 216 | + Returns |
| 217 | + ------- |
| 218 | + out : dict |
| 219 | +
|
| 220 | + Examples |
| 221 | + -------- |
| 222 | +
|
| 223 | + '>>> c = compare(df1, df2, names = ['df1','df2'], dups = True, same = True, comment =True)' |
| 224 | +
|
| 225 | + There are 133891 same values |
| 226 | + There are 16531 outliers in df1 |
| 227 | + There are 20937 outliers in df2 |
| 228 | + There are 48704 duplicates in df1 |
| 229 | + There are 0 duplicates in df2 |
| 230 | + The DataFrames are not the same |
| 231 | +
|
| 232 | + '>>> c = compare(df2, df2, names = ['df2','df2'], dups = True, same = True, comment =True)' |
| 233 | +
|
| 234 | + There are 154444 same values |
| 235 | + There are 0 outliers in df2 |
| 236 | + There are 0 outliers in df2 |
| 237 | + There are 0 duplicates in df2 |
| 238 | + There are 0 duplicates in df2 |
| 239 | + The DataFrames are the same |
| 240 | + """ |
| 241 | + |
| 242 | + if not isinstance(x, pd.DataFrame): |
| 243 | + raise ValueError('Please input x as a pandas.DataFrame') |
| 244 | + elif not isinstance(y, pd.DataFrame): |
| 245 | + raise ValueError('Please input y as a pandas.DataFrame') |
| 246 | + elif not isinstance(names, list): |
| 247 | + raise ValueError('Please input names as a list') |
| 248 | + elif not isinstance(dups, bool): |
| 249 | + raise ValueError('Please input dups as a bool') |
| 250 | + elif not isinstance(same, bool): |
| 251 | + raise ValueError('Please input same as a bool') |
| 252 | + elif not isinstance(comment, bool): |
| 253 | + raise ValueError('Please input comment as a bool') |
| 254 | + else: |
| 255 | + pass |
| 256 | + |
| 257 | + dict_temp = {} |
| 258 | + |
| 259 | + try: |
| 260 | + dict_temp['same_values'] = pd.merge(x.drop_duplicates(),y.drop_duplicates(), how = 'inner') |
| 261 | + except: |
| 262 | + print('Unable to identify same values') |
| 263 | + try: |
| 264 | + dict_temp[names[0] + '_not_' + names[1]] = pd.concat([x,dict_temp['same_values']], ignore_index = True).drop_duplicates(keep = False) |
| 265 | + dict_temp[names[1] + '_not_' + names[0]] = pd.concat([y,dict_temp['same_values']], ignore_index = True).drop_duplicates(keep = False) |
| 266 | + except: |
| 267 | + print('Unable to find outliers') |
| 268 | + |
| 269 | + if dups == True: |
| 270 | + try: |
| 271 | + dict_temp[names[0] + '_dups'] = x[x.duplicated() == True] |
| 272 | + dict_temp[names[1] + '_dups'] = y[y.duplicated() == True] |
| 273 | + except: |
| 274 | + print('Unable to find duplicates') |
| 275 | + if same == True: |
| 276 | + try: |
| 277 | + if (x.shape == y.shape) & (x.shape == dict_temp['same_values'].shape): |
| 278 | + dict_temp['Same'] = True |
| 279 | + else: |
| 280 | + dict_temp['Same'] = False |
| 281 | + except: |
| 282 | + print('Unable to determine whether the Dataframes are the same') |
| 283 | + try: |
| 284 | + if comment == True: |
| 285 | + print('\nThere are ' + str(dict_temp['same_values'].shape[0]) + ' same values') |
| 286 | + print('There are ' + str(dict_temp[names[0] + '_not_' + names[1]].shape[0]) + ' outliers in ' + str(names[0])) |
| 287 | + print('There are ' + str(dict_temp[names[1] + '_not_' + names[0]].shape[0]) + ' outliers in ' + str(names[1])) |
| 288 | + if dups == True: |
| 289 | + print('There are ' + str(dict_temp[names[0] + '_dups'].shape[0]) + ' duplicates in ' + names[0]) |
| 290 | + print('There are ' + str(dict_temp[names[1] + '_dups'].shape[0]) + ' duplicates in ' + names[1]) |
| 291 | + if same == True: |
| 292 | + if dict_temp['Same'] == True: |
| 293 | + s = 'the same' |
| 294 | + else: |
| 295 | + s = 'not the same' |
| 296 | + print('DataFrames are ' + s) |
| 297 | + except: |
| 298 | + print('Unable to print commentary') |
| 299 | + |
| 300 | + return dict_temp |
0 commit comments