"""This file contains code for use with "Think Stats", by Allen B. Downey, available from greenteapress.com Copyright 2010 Allen B. Downey License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html """ from __future__ import print_function, division import sys import numpy as np import thinkstats2 from collections import defaultdict def ReadFemResp(dct_file='2002FemResp.dct', dat_file='2002FemResp.dat.gz', nrows=None): """Reads the NSFG respondent data. dct_file: string file name dat_file: string file name returns: DataFrame """ dct = thinkstats2.ReadStataDct(dct_file) df = dct.ReadFixedWidth(dat_file, compression='gzip', nrows=nrows) CleanFemResp(df) return df def CleanFemResp(df): """Recodes variables from the respondent frame. df: DataFrame """ pass def ReadFemPreg(dct_file='2002FemPreg.dct', dat_file='2002FemPreg.dat.gz'): """Reads the NSFG pregnancy data. dct_file: string file name dat_file: string file name returns: DataFrame """ dct = thinkstats2.ReadStataDct(dct_file) df = dct.ReadFixedWidth(dat_file, compression='gzip') CleanFemPreg(df) return df def CleanFemPreg(df): """Recodes variables from the pregnancy frame. df: DataFrame """ # mother's age is encoded in centiyears; convert to years df.agepreg /= 100.0 # birthwgt_lb contains at least one bogus value (51 lbs) # replace with NaN df.loc[df.birthwgt_lb > 20, 'birthwgt_lb'] = np.nan # replace 'not ascertained', 'refused', 'don't know' with NaN na_vals = [97, 98, 99] df.birthwgt_lb.replace(na_vals, np.nan, inplace=True) df.birthwgt_oz.replace(na_vals, np.nan, inplace=True) df.hpagelb.replace(na_vals, np.nan, inplace=True) df.babysex.replace([7, 9], np.nan, inplace=True) df.nbrnaliv.replace([9], np.nan, inplace=True) # birthweight is stored in two columns, lbs and oz. # convert to a single column in lb # NOTE: creating a new column requires dictionary syntax, # not attribute assignment (like df.totalwgt_lb) df['totalwgt_lb'] = df.birthwgt_lb + df.birthwgt_oz / 16.0 # due to a bug in ReadStataDct, the last variable gets clipped; # so for now set it to NaN df.cmintvw = np.nan def ValidatePregnum(resp, preg): """Validate pregnum in the respondent file. resp: respondent DataFrame preg: pregnancy DataFrame """ # make the map from caseid to list of pregnancy indices preg_map = MakePregMap(preg) # iterate through the respondent pregnum series for index, pregnum in resp.pregnum.iteritems(): caseid = resp.caseid[index] indices = preg_map[caseid] # check that pregnum from the respondent file equals # the number of records in the pregnancy file if len(indices) != pregnum: print(caseid, len(indices), pregnum) return False return True def MakePregMap(df): """Make a map from caseid to list of preg indices. df: DataFrame returns: dict that maps from caseid to list of indices into `preg` """ d = defaultdict(list) for index, caseid in df.caseid.iteritems(): d[caseid].append(index) return d def main(): """Tests the functions in this module. script: string script name """ # read and validate the respondent file resp = ReadFemResp() assert(len(resp) == 7643) assert(resp.pregnum.value_counts()[1] == 1267) # read and validate the pregnancy file preg = ReadFemPreg() print(preg.shape) assert len(preg) == 13593 assert preg.caseid[13592] == 12571 assert preg.pregordr.value_counts()[1] == 5033 assert preg.nbrnaliv.value_counts()[1] == 8981 assert preg.babysex.value_counts()[1] == 4641 assert preg.birthwgt_lb.value_counts()[7] == 3049 assert preg.birthwgt_oz.value_counts()[0] == 1037 assert preg.prglngth.value_counts()[39] == 4744 assert preg.outcome.value_counts()[1] == 9148 assert preg.birthord.value_counts()[1] == 4413 assert preg.agepreg.value_counts()[22.75] == 100 assert preg.totalwgt_lb.value_counts()[7.5] == 302 weights = preg.finalwgt.value_counts() key = max(weights.keys()) assert preg.finalwgt.value_counts()[key] == 6 # validate that the pregnum column in `resp` matches the number # of entries in `preg` assert(ValidatePregnum(resp, preg)) print('All tests passed.') if __name__ == '__main__': main()