import pandas as pd

# Load the first 200,000 rows of the sales training data.
df = pd.read_csv('trainingset.csv', sep=',', low_memory=False).head(200000)

# Expand the abbreviated source column names into readable ones.
# NOTE(review): 'com_unit' is mapped to 'community_unit'; it may actually stand
# for commercial units -- confirm against the data dictionary.
df = df.rename(columns={
    'bbl_id': 'building_id',
    "bldg_ctgy": "building_category",
    "tax_cls_p": "tax_class_present",
    "bldg_cls_p": "building_class_present",
    "res_unit": "residential_unit",
    "com_unit": "community_unit",
    "tot_unit": "total_units",
    "yr_built": "year_built",
    "tax_cls_s": "tax_class_sale",
    "bldg_cls_s": "building_class_sale",
})
print(df.columns)

# Keep only sales priced above 75,000. The explicit copy makes df an
# independent frame, so later column assignments on it do not trigger
# pandas' chained-assignment (SettingWithCopy) warning.
df = df[df['price'] > 75000].copy()

# Identifier/target columns that are later concatenated (axis=1) with the
# per-row dummy frames built from df. This is taken AFTER the price filter so
# both sides share the same index; taking it before the filter made the concat
# pad every filtered-out row with NaN dummies.
# NOTE: the name shadows the builtin id(); it is kept because later sections
# of this script reference it.
id = df[['building_id', 'price']]
# Disabled experiments (left commented out):
#sales_zip_dummies = significant_dummies(df['zip'],threshold=.01)
#sales_cat_dummies = significant_dummies(df['building_category'],threshold=.01)
# =============================================================================
# zip_dummies = pd.get_dummies(df["zip"], prefix=df["zip"].name)
# df = df.drop(columns="zip")
# df_zip = pd.concat([id, zip_dummies], axis=1)
# #df_zip.to_csv('zips2.csv')
# building_category_dummies = pd.get_dummies(df["building_category"], prefix=df["building_category"].name)
# df = df.drop(columns="building_category")
# df_build_cat = pd.concat([id, building_category_dummies], axis=1)
# =============================================================================
import re

# Matches a leading run of digits followed by one whitespace character
# (e.g. a house number), or the literal text "N/A" anywhere in the value.
_address_noise = re.compile(r'(^[0-9]+\s)|N/A')

# Coerce every address to text, then strip the matches above.
df['address'] = df['address'].astype(str).map(
    lambda raw: _address_noise.sub('', str(raw))
)

# One indicator column per distinct cleaned address, prefixed with the
# column's own name.
address_dummies = pd.get_dummies(df['address'], prefix=df['address'].name)
df = df.drop(columns='address')

# Attach the identifier/price columns to the address indicators.
df_address = pd.concat([id, address_dummies], axis=1)
print('YAY')
#df_address.to_csv('addresses.csv')
#sales_address_dummies = significant_dummies(df['address'],threshold=.01)
# Show the rows whose borough code is 4 before the codes are replaced by names.
code_four_rows = df[df['borough'] == 4]
print(code_four_rows)

# Translate the numeric borough codes into borough names.
borough_names = {
    1: "Manhattan",
    2: "Bronx",
    3: 'Brooklyn',
    4: 'Queens',
    5: 'Staten Island',
}
df['borough'] = df['borough'].replace(borough_names)
#print(df[df['borough'] == 'Queens'])

# One indicator column per borough, joined to the identifier/price columns.
borough_dummies = pd.get_dummies(df['borough'], prefix=df['borough'].name)
df_boroughs = pd.concat([id, borough_dummies], axis=1)
#df_boroughs.to_csv('boroughs3.csv')
import pandas as pd
import numpy as np

# Disabled plotting experiment (left commented out):
# =============================================================================
# import matplotlib.pyplot as plt
# dfborough = df.groupby('borough').mean().reset_index()
# x = dfborough['borough']
# y = dfborough['price']
# plt.bar(x,y)
# plt.show()
# print('ok')
# dfborough = df.groupby('borough').mean().reset_index()
# x = dfborough['borough']
# y = dfborough['price']
# plt.bar(x,y)
# plt.show()
# print('ok')
#
# =============================================================================
#sales_bor_dummies = significant_dummies(df['borough'],threshold=.01)

# Swap the borough column in df for its indicator columns.
borough_dummies = pd.get_dummies(df['borough'], prefix=df['borough'].name)
df = pd.concat([df.drop(columns='borough'), borough_dummies], axis=1)

#df = pd.read_csv('addresses.csv')
print(df.columns)
# Build the model inputs from the address-indicator frame.
# df2 is an alias of df_address (not a copy), so the in-place dropna below
# also removes the rows from df_address.
df2 = df_address

# Last rows before dropping incomplete rows. The original passed the method
# object itself (`df2.tail`) to print, which shows the bound-method repr
# rather than the tail of the frame.
print(df2.tail())

# Discard every row that has a missing value in any column.
df2.dropna(inplace=True)
print(df2.tail())

# Features: every column except the target and the identifier.
X = df2.drop(columns=['price', 'building_id'])

# Target: price as a flat 1-D NumPy array.
Y = np.array(df2['price']).reshape(-1)
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=.20, random_state=40
)

# Report the split sizes. The original left this as a bare expression, which
# only displays something in an interactive cell and is a no-op in a script.
print(X_train.shape, X_test.shape)

# Ordinary least-squares model with default settings; fitted afterwards.
lr = LinearRegression()
# Train the regression on the training split.
lr.fit(X=X_train, y=y_train)
from sklearn.metrics import mean_absolute_error

# R^2 on the training split and on the held-out split.
train_r2 = lr.score(X_train, y_train)
print('Train:', train_r2)
validation_r2 = lr.score(X_test, y_test)
print('Validation:', validation_r2)

# Mean absolute error of the model's predictions on the held-out split.
held_out_predictions = lr.predict(X_test)
print('MAE:', mean_absolute_error(y_test, held_out_predictions))

# Reference value: mean absolute deviation of the held-out targets from their
# own mean, i.e. the error of always predicting that mean.
deviation_from_mean = abs(y_test - y_test.mean())
print('Random prediction:', deviation_from_mean.mean())