# -*- coding: utf-8 -*-
"""
Created on Sun Aug 28 21:38:12 2016
@author: User
"""
#importing libraries
import pandas
import numpy
# Read addhealth_pds.csv into a DataFrame. low_memory=False makes pandas
# process the file in a single pass instead of internal chunks.
mydata = pandas.read_csv('addhealth_pds.csv', low_memory=False)

# Coerce the variables used below to numeric; anything that cannot be parsed
# becomes NaN. (Series.convert_objects no longer exists in current pandas;
# pandas.to_numeric with errors='coerce' is the replacement.)
# Each column is converted into itself, so H1WP18A is numeric before it is
# summed with H1WP18B further down.
mydata['H1WP13'] = pandas.to_numeric(mydata['H1WP13'], errors='coerce')
mydata['H1WP18B'] = pandas.to_numeric(mydata['H1WP18B'], errors='coerce')
mydata['H1WP18A'] = pandas.to_numeric(mydata['H1WP18A'], errors='coerce')

# Frequency table of the raw H1WP13 codes (how close do you feel to your
# father), including NaN.
print('counts for original H1WP13')
c1 = mydata['H1WP13'].value_counts(sort=False, dropna=False)
print(c1)

# Same table as proportions (NaN is excluded by value_counts by default).
print('percentage %')
p1 = mydata['H1WP13'].value_counts(sort=False, normalize=True)
print(p1)

# Recode the value 8 in H1WP13 to missing (NaN).
mydata['H1WP13'] = mydata['H1WP13'].replace(8, numpy.nan)

# Second frequency table, now showing the recoded values as NaN.
print('counts for missing data how close do you fell your father')
c2 = mydata['H1WP13'].value_counts(sort=False, dropna=False)
print(c2)

# Repeat the procedure for H1WP18B.
print('counts for original H1WP18B')
c3 = mydata['H1WP18B'].value_counts(sort=False, dropna=False)
print(c3)

# Recode the value 6 (refused) in H1WP18B to missing (NaN).
mydata['H1WP18B'] = mydata['H1WP18B'].replace(6, numpy.nan)
print('counts for missing data how many played a sport with their father in the past 4 weeks')
c4 = mydata['H1WP18B'].value_counts(sort=False, dropna=False)
print(c4)

# Coding in valid data: replace NaN in H1WP13 with the numeric value 11.
# Assign the result back instead of calling fillna(inplace=True) on the
# column selection, which is not guaranteed to modify the DataFrame.
mydata['H1WP13'] = mydata['H1WP13'].fillna(11)

# Recode the value 8 as missing.
# NOTE(review): the 8s were already set to NaN above and have therefore just
# been filled with 11, so this replace finds nothing to change. Confirm the
# intended order of the two recodes.
mydata['H1WP13'] = mydata['H1WP13'].replace(8, numpy.nan)
print('H1WP13 with Blanks recoded as 11 and 8 set to NAN')

# Check the coding with a frequency table and summary statistics.
chk2 = mydata['H1WP13'].value_counts(sort=False, dropna=False)
print(chk2)
ds2 = mydata["H1WP13"].describe()
print(ds2)

# Secondary variable: row-wise sum of H1WP18B and H1WP18A (NaN if either
# operand is NaN).
mydata['H1WP18_both'] = mydata['H1WP18B'] + mydata['H1WP18A']

# Subset the two source columns and their sum into a new data frame, sub1,
# and show the first 10 rows.
sub1 = mydata[['H1WP18B', 'H1WP18A', 'H1WP18_both']]
a = sub1.head(n=10)
print(a)

#-----------------------------------------------------------
# Split H1WP13 into 4 quantile-based categories.
print('4 categories')
mydata['H1WP13_NEW'] = pandas.qcut(
    mydata.H1WP13,
    4,
    labels=["1=0%tile", "2=25%tile", "3=50%tile", "4=75%tile"],
)
c4 = mydata['H1WP13_NEW'].value_counts(sort=False, dropna=True)
print(c4)

# Crosstab showing which H1WP13 values were placed in which H1WP13_NEW
# category.
print(pandas.crosstab(mydata['H1WP13_NEW'], mydata['H1WP13']))
"""
Created on Sun Aug 28 21:38:12 2016
@author: User
"""
#importing libraries
import pandas
import numpy
# NOTE(review): from here on the file repeats the same analysis that appears
# earlier in the file (it looks like a paste duplication) - confirm whether
# this second run is wanted.

# Read addhealth_pds.csv into a DataFrame. low_memory=False makes pandas
# process the file in a single pass instead of internal chunks.
mydata = pandas.read_csv('addhealth_pds.csv', low_memory=False)

# Coerce the variables used below to numeric; anything that cannot be parsed
# becomes NaN. (Series.convert_objects no longer exists in current pandas;
# pandas.to_numeric with errors='coerce' is the replacement.)
# Each column is converted into itself, so H1WP18A is numeric before it is
# summed with H1WP18B further down.
mydata['H1WP13'] = pandas.to_numeric(mydata['H1WP13'], errors='coerce')
mydata['H1WP18B'] = pandas.to_numeric(mydata['H1WP18B'], errors='coerce')
mydata['H1WP18A'] = pandas.to_numeric(mydata['H1WP18A'], errors='coerce')

# Frequency table of the raw H1WP13 codes (how close do you feel to your
# father), including NaN.
print('counts for original H1WP13')
c1 = mydata['H1WP13'].value_counts(sort=False, dropna=False)
print(c1)

# Same table as proportions (NaN is excluded by value_counts by default).
print('percentage %')
p1 = mydata['H1WP13'].value_counts(sort=False, normalize=True)
print(p1)

# Recode the value 8 in H1WP13 to missing (NaN).
mydata['H1WP13'] = mydata['H1WP13'].replace(8, numpy.nan)

# Second frequency table, now showing the recoded values as NaN.
print('counts for missing data how close do you fell your father')
c2 = mydata['H1WP13'].value_counts(sort=False, dropna=False)
print(c2)

# Repeat the procedure for H1WP18B.
print('counts for original H1WP18B')
c3 = mydata['H1WP18B'].value_counts(sort=False, dropna=False)
print(c3)

# Recode the value 6 (refused) in H1WP18B to missing (NaN).
mydata['H1WP18B'] = mydata['H1WP18B'].replace(6, numpy.nan)
print('counts for missing data how many played a sport with their father in the past 4 weeks')
c4 = mydata['H1WP18B'].value_counts(sort=False, dropna=False)
print(c4)

# Coding in valid data: replace NaN in H1WP13 with the numeric value 11.
# Assign the result back instead of calling fillna(inplace=True) on the
# column selection, which is not guaranteed to modify the DataFrame.
mydata['H1WP13'] = mydata['H1WP13'].fillna(11)

# Recode the value 8 as missing.
# NOTE(review): the 8s were already set to NaN above and have therefore just
# been filled with 11, so this replace finds nothing to change. Confirm the
# intended order of the two recodes.
mydata['H1WP13'] = mydata['H1WP13'].replace(8, numpy.nan)
print('H1WP13 with Blanks recoded as 11 and 8 set to NAN')

# Check the coding with a frequency table and summary statistics.
chk2 = mydata['H1WP13'].value_counts(sort=False, dropna=False)
print(chk2)
ds2 = mydata["H1WP13"].describe()
print(ds2)

# Secondary variable: row-wise sum of H1WP18B and H1WP18A (NaN if either
# operand is NaN).
mydata['H1WP18_both'] = mydata['H1WP18B'] + mydata['H1WP18A']

# Subset the two source columns and their sum into a new data frame, sub1,
# and show the first 10 rows.
sub1 = mydata[['H1WP18B', 'H1WP18A', 'H1WP18_both']]
a = sub1.head(n=10)
print(a)

#-----------------------------------------------------------
# Split H1WP13 into 4 quantile-based categories.
print('4 categories')
mydata['H1WP13_NEW'] = pandas.qcut(
    mydata.H1WP13,
    4,
    labels=["1=0%tile", "2=25%tile", "3=50%tile", "4=75%tile"],
)
c4 = mydata['H1WP13_NEW'].value_counts(sort=False, dropna=True)
print(c4)

# Crosstab showing which H1WP13 values were placed in which H1WP13_NEW
# category.
print(pandas.crosstab(mydata['H1WP13_NEW'], mydata['H1WP13']))
# No comments:  (blog-page footer text, translated from Greek)
# Post a comment