Contact
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Sign Up | Sign In
| Download

📚 The CoCalc Library - books, templates and other resources

Views: 96160
License: OTHER
1
import pandas as pd
2
import numpy as np
3
import matplotlib.pyplot as plt
4
5
import re
6
7
class FixedWidthVariables(object):
    """Represents a set of variables in a fixed width file."""

    def __init__(self, variables, index_base=0):
        """Initializes.

        variables: DataFrame with `start`, `end` and `name` columns
        index_base: are the indices 0 or 1 based?  This value is
                    subtracted from every start and end index.

        Attributes:
        variables: the DataFrame that was passed in
        colspecs: list of [start, end] index pairs
        names: Series of string variable names
        """
        self.variables = variables

        # note: index_base is subtracted from the indices, so the default
        # (0) leaves them unchanged and index_base=1 shifts them down by 1
        shifted = variables[['start', 'end']] - index_base

        # convert colspecs to a list of pairs of int; the builtin `int` is
        # used because the `np.int` alias was removed in NumPy 1.24
        self.colspecs = shifted.astype(int).values.tolist()
        self.names = variables['name']

    def read_fixed_width(self, filename, **options):
        """Reads a fixed width ASCII file.

        filename: string filename
        options: keyword args passed to pd.read_fwf()

        returns: DataFrame
        """
        df = pd.read_fwf(filename,
                         colspecs=self.colspecs,
                         names=self.names,
                         **options)
        return df
41
42
43
def read_stata_dict(dct_file, **options):
    """Parses a Stata dictionary file into a FixedWidthVariables object.

    Only lines containing a `_column(N)` marker are used; every other
    line is skipped.

    dct_file: string filename
    options: dict of options passed to open()

    returns: FixedWidthVariables object
    """
    column_marker = re.compile(r'_column\(([^)]*)\)')
    converters = {
        'byte': int,
        'int': int,
        'long': int,
        'float': float,
        'double': float,
        'numeric': float,
    }

    rows = []
    with open(dct_file, **options) as fp:
        for line in fp:
            found = column_marker.search(line)
            if found is None:
                continue

            start = int(found.group(1))
            tokens = line.split()
            vtype, name, fstring = tokens[1:4]
            name = name.lower()
            # any type starting with 'str' maps to str; the rest are
            # looked up in the table
            vtype = str if vtype.startswith('str') else converters[vtype]
            long_desc = ' '.join(tokens[4:]).strip('"')
            rows.append((start, vtype, name, fstring, long_desc))

    variables = pd.DataFrame(
        rows, columns=['start', 'type', 'name', 'fstring', 'desc'])

    # each variable ends where the next one starts; the last one has no
    # successor, so its end is set to 0
    variables['end'] = variables.start.shift(-1)
    variables.loc[len(variables)-1, 'end'] = 0

    return FixedWidthVariables(variables, index_base=1)
80
81
82
def read_stata(dct_name, dat_name, **options):
    """Reads a fixed width data file described by a Stata dictionary.

    dct_name: string filename of the Stata dictionary file
    dat_name: string filename of the fixed width data file
    options: keyword args passed to read_fixed_width()

    returns: DataFrame
    """
    layout = read_stata_dict(dct_name)
    return layout.read_fixed_width(dat_name, **options)
92
93
94
def sample_rows(df, nrows, replace=False):
    """Draws a random sample of rows from a DataFrame.

    df: DataFrame
    nrows: number of rows
    replace: whether to sample with replacement

    returns: DataFrame
    """
    chosen = np.random.choice(df.index, size=nrows, replace=replace)
    return df.loc[chosen]
106
107
108
def resample_rows(df):
    """Draws a sample, with replacement, as large as the DataFrame itself.

    df: DataFrame

    returns: DataFrame
    """
    n = len(df)
    return sample_rows(df, nrows=n, replace=True)
116
117
118
def resample_rows_weighted(df, column='finalwgt'):
    """Resamples rows with probability proportional to a weight column.

    The sample has as many rows as df and is drawn with replacement.

    df: DataFrame
    column: string column name to use as weights

    returns: DataFrame
    """
    raw = df[column]
    # normalize so the weights add up to 1; df itself is not modified
    probs = raw / sum(raw)
    picked = np.random.choice(df.index, size=len(df), replace=True, p=probs)
    return df.loc[picked]
131
132
133
def resample_by_year(df, column='wtssall'):
    """Performs a weighted resampling separately for each value of `year`.

    df: DataFrame with a `year` column
    column: string name of weight variable

    returns: DataFrame with a fresh integer index
    """
    samples = []
    for _, group in df.groupby('year'):
        samples.append(resample_rows_weighted(group, column))

    return pd.concat(samples, ignore_index=True)
146
147
148
def values(df, varname):
    """Counts how often each value occurs, sorted by value.

    df: DataFrame
    varname: string column name

    returns: Series that maps from value to frequency
    """
    counts = df[varname].value_counts()
    return counts.sort_index()
157
158
159
def fill_missing(df, varname, badvals=(98, 99)):
    """Fill missing data with random values.

    Values listed in `badvals` are first replaced with NaN; every NaN in
    the column is then overwritten with a value drawn at random (with
    replacement) from the non-missing values of the same column.

    df: DataFrame, modified in place
    varname: string column name
    badvals: list of values to be replaced

    returns: number of missing values replaced
    """
    # Series.replace is handed a list; a tuple is accepted too so the
    # default does not have to be a mutable list
    if isinstance(badvals, tuple):
        badvals = list(badvals)

    # replace badvals with NaN; the result is assigned back rather than
    # calling replace(inplace=True) on df[varname], which may act on a
    # temporary copy and leave df unchanged
    df[varname] = df[varname].replace(badvals, np.nan)

    # get the index of rows missing varname
    null = df[varname].isnull()
    n_missing = int(null.sum())

    # choose a random sample from the non-missing values
    fill = np.random.choice(df[varname].dropna(), n_missing, replace=True)

    # replace missing data with the samples
    df.loc[null, varname] = fill

    # return the number of missing values replaced
    return n_missing
181
182
183
def round_into_bins(df, var, bin_width, high=None, low=0):
    """Maps each value to the lower edge of the bin that contains it.

    Bin edges run from `low` in steps of `bin_width`.

    df: DataFrame
    var: string variable name
    bin_width: number, width of the bins
    high: number used as the upper limit when building the bin edges;
          defaults to the maximum of the column
    low: number, lower edge of the first bin

    NOTE: a value below `low` gets index -1 and therefore wraps around
    to the last bin edge.

    returns: array of bin values
    """
    upper = df[var].max() if high is None else high

    edges = np.arange(low, upper+bin_width, bin_width)
    # digitize returns the index of the edge just above each value, so
    # stepping back by one gives the lower edge
    slots = np.digitize(df[var], edges) - 1
    return edges[slots]
198
199
200
def underride(d, **options):
    """Fills in defaults: sets each option in d unless d already has the key.

    d: dictionary, modified in place
    options: keyword args to add to d

    returns: d
    """
    for name in options:
        d.setdefault(name, options[name])

    return d
210
211
212
def decorate(**options):
    """Sets properties of the current axes and optionally adds a legend.

    Example:

    decorate(title='Title',
             xlabel='x',
             ylabel='y')

    Two keywords are handled here and removed before the rest are used:

    legend: pass `legend=False` to suppress the legend (default True)
    loc: location of the legend (default 'best')

    All remaining keyword arguments are passed to the `set` method of
    the current axes, so they can be any of the axis properties
    https://matplotlib.org/api/axes_api.html
    """
    legend_loc = options.pop('loc', 'best')
    show_legend = options.pop('legend', True)
    if show_legend:
        legend(loc=legend_loc)

    axes = plt.gca()
    axes.set(**options)
    plt.tight_layout()
230
231
232
def legend(**options):
    """Draws a legend only if there is at least one labeled item.

    options are passed to the `legend` method of the current axes;
    `loc` defaults to 'best' when it is not given
    https://matplotlib.org/api/_as_gen/matplotlib.pyplot.legend.html
    """
    underride(options, loc='best')

    ax = plt.gca()
    handles, labels = ax.get_legend_handles_labels()

    # skip the call when nothing is labeled, so no legend is drawn
    if handles:
        ax.legend(handles, labels, **options)
243
244