Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
| Download
Views: 647
Image: ubuntu2004
Kernel: Python 3 (system-wide)

Library Imports Here

#import pandas here import pandas as pd
#import numpy here import numpy as np
#import sklearn import sklearn
#import matplot import matplotlib.pyplot as plt
#import kmeans from sklearn.cluster import KMeans

Data Frame Reading and Practice

#read in your SAT data sat_data = pd.read_csv("2012_SAT_Results.csv")
#print the first five rows using head() sat_data.head()
DBN SCHOOL NAME Num of SAT Test Takers SAT Critical Reading Avg. Score SAT Math Avg. Score SAT Writing Avg. Score
0 01M292 HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES 29 355 404 363
1 01M448 UNIVERSITY NEIGHBORHOOD HIGH SCHOOL 91 383 423 366
2 01M450 EAST SIDE COMMUNITY SCHOOL 70 377 402 370
3 01M458 FORSYTH SATELLITE ACADEMY 7 414 401 359
4 01M509 MARTA VALLE HIGH SCHOOL 44 390 433 384
#print the last five rows using tail sat_data.tail()
DBN SCHOOL NAME Num of SAT Test Takers SAT Critical Reading Avg. Score SAT Math Avg. Score SAT Writing Avg. Score
473 75X012 P.S. X012 LEWIS AND CLARK SCHOOL s s s s
474 75X754 J. M. RAPPORT SCHOOL CAREER DEVELOPMENT s s s s
475 79M645 SCHOOL FOR COOPERATIVE TECHNICAL EDUCATION s s s s
476 79Q950 GED PLUS s CITYWIDE 8 496 400 426
477 79X490 PHOENIX ACADEMY 9 367 370 360
#find out how many columns and rows our data has sat_data.shape
(478, 6)
#print the columns sat_data.columns
Index(['DBN', 'SCHOOL NAME', 'Num of SAT Test Takers', 'SAT Critical Reading Avg. Score', 'SAT Math Avg. Score', 'SAT Writing Avg. Score'], dtype='object')
#print the types of data
#describe the data
#slice 15 rows of your data
#slice 15 rows of a column
#slice 15 rows of two columns of data together
#slices 15 rows of three columns of data together
#slices the last 10 rows of three columns of data together
#Find the values of a column of data sat_data["SCHOOL NAME"].value_counts()
WASHINGTON IRVING YABC 1 HILLCREST HIGH SCHOOL 1 LEON M. GOLDSTEIN HIGH SCHOOL FOR THE SCIENCES 1 HIGH SCHOOL FOR HEALTH PROFESSIONS AND HUMAN SERVICES 1 ART AND DESIGN HIGH SCHOOL 1 .. BRONX CAREER AND COLLEGE PREPARATORY HIGH SCHOOL 1 ACADEMY FOR COLLEGE PREPARATION AND CAREER EXPLORATION: A COLLEGE BOARD SCHOOL 1 BENJAMIN BANNEKER ACADEMY 1 PERFORMING ARTS AND TECHNOLOGY HIGH SCHOOL 1 JOHN ADAMS HIGH SCHOOL 1 Name: SCHOOL NAME, Length: 478, dtype: int64
#count the number of each type of value for a column of data
#use conditional formatting to render a part of your data and store it as a variable
#Use 2 conditional formats to render a part of your data and store it as a variable
#count the value types of your stored variables separately

Image Processing

import os
from IPython.display import Image
Image(filename = "machinelearning.png", height = 1000, width = 1000)
Image in a Jupyter notebook
Image(filename = "unsupervised.png", height = 1000, width = 1000)
Image in a Jupyter notebook
Image(filename = "kmeansalg.png", height = 1000, width = 1000)
Image in a Jupyter notebook

Data PreProcessing and Cleaning

sat_data.head()
DBN SCHOOL NAME Num of SAT Test Takers SAT Critical Reading Avg. Score SAT Math Avg. Score SAT Writing Avg. Score
0 01M292 HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES 29 355 404 363
1 01M448 UNIVERSITY NEIGHBORHOOD HIGH SCHOOL 91 383 423 366
2 01M450 EAST SIDE COMMUNITY SCHOOL 70 377 402 370
3 01M458 FORSYTH SATELLITE ACADEMY 7 414 401 359
4 01M509 MARTA VALLE HIGH SCHOOL 44 390 433 384
sat_data.drop("DBN", axis = 1, inplace=True)
sat_data.head()
SCHOOL NAME Num of SAT Test Takers SAT Critical Reading Avg. Score SAT Math Avg. Score SAT Writing Avg. Score
0 HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES 29 355 404 363
1 UNIVERSITY NEIGHBORHOOD HIGH SCHOOL 91 383 423 366
2 EAST SIDE COMMUNITY SCHOOL 70 377 402 370
3 FORSYTH SATELLITE ACADEMY 7 414 401 359
4 MARTA VALLE HIGH SCHOOL 44 390 433 384
sat_data.columns
Index(['SCHOOL NAME', 'Num of SAT Test Takers', 'SAT Critical Reading Avg. Score', 'SAT Math Avg. Score', 'SAT Writing Avg. Score'], dtype='object')
import seaborn as sns
# Columns whose value distributions are plotted below.
col_list = [
    'Num of SAT Test Takers',
    'SAT Critical Reading Avg. Score',
    'SAT Math Avg. Score',
    'SAT Writing Avg. Score',
]

# Draw one count plot per column and show each figure before starting the next.
for col in col_list:
    sns.countplot(data=sat_data, x=col, palette='Set3')
    # Rotate the x tick labels by 90 degrees.
    plt.xticks(rotation=90)
    plt.show()
Image in a Jupyter notebookImage in a Jupyter notebookImage in a Jupyter notebookImage in a Jupyter notebook
sat_data.isnull().sum() #isnull returns a boolean value of True or False depending on whether there is an empty cell in our data set #we can use .sum() to count it.
SCHOOL NAME 0 Num of SAT Test Takers 0 SAT Critical Reading Avg. Score 0 SAT Math Avg. Score 0 SAT Writing Avg. Score 0 dtype: int64
sat_data["Num of SAT Test Takers"].unique() #unique returns each distinct value that appears in the column, listed once each; here it exposes the non-numeric 's' entry among the counts
array(['29', '91', '70', '7', '44', '112', '159', '18', '130', '16', '62', '53', '58', '85', '48', '76', '50', '40', '69', '42', '60', '92', 's', '79', '263', '54', '94', '104', '114', '66', '103', '127', '144', '336', '84', '95', '59', '72', '49', '151', '832', '167', '25', '81', '264', '131', '73', '14', '78', '26', '77', '56', '30', '33', '121', '9', '335', '36', '83', '154', '191', '270', '61', '27', '41', '12', '32', '261', '531', '75', '35', '111', '43', '375', '51', '31', '20', '214', '101', '55', '63', '24', '228', '65', '34', '64', '28', '47', '52', '67', '39', '415', '6', '68', '80', '74', '38', '113', '86', '57', '443', '731', '109', '99', '10', '46', '97', '189', '37', '1277', '90', '105', '8', '13', '89', '185', '102', '134', '142', '141', '71', '165', '259', '17', '182', '456', '238', '694', '385', '475', '727', '448', '119', '824', '518', '236', '11', '155', '320', '241', '138', '396', '45', '558', '347', '278', '888', '934', '334', '708', '175', '87', '93', '404', '403', '194', '762', '462', '422', '98', '395', '392', '174', '148', '143', '135', '137', '107', '391', '271', '807', '535', '227', '88', '23'], dtype=object)
sat_data.columns
Index(['SCHOOL NAME', 'Num of SAT Test Takers', 'SAT Critical Reading Avg. Score', 'SAT Math Avg. Score', 'SAT Writing Avg. Score'], dtype='object')
sat_data['SAT Critical Reading Avg. Score'].unique()
array(['355', '383', '377', '414', '390', '332', '522', '417', '624', '395', '409', '394', '374', '423', '404', '353', '375', '403', '408', '373', '391', '473', 's', '319', '465', '492', '509', '496', '537', '517', '468', '572', '528', '429', '416', '356', '441', '378', '376', '679', '443', '370', '469', '407', '336', '389', '412', '368', '405', '498', '527', '350', '345', '458', '426', '399', '444', '406', '384', '371', '577', '566', '396', '433', '401', '354', '461', '432', '369', '358', '357', '413', '360', '605', '362', '380', '365', '363', '400', '430', '339', '364', '379', '310', '361', '351', '420', '367', '387', '398', '337', '402', '366', '324', '372', '411', '393', '459', '386', '382', '315', '314', '438', '304', '300', '392', '343', '419', '632', '348', '636', '381', '431', '388', '347', '321', '342', '352', '439', '587', '305', '435', '471', '586', '397', '359', '287', '338', '457', '418', '385', '476', '425', '313', '410', '341', '279', '326', '478', '436', '524', '456', '349', '311', '317', '545', '445', '455', '621', '480', '462', '499', '452', '513', '612', '504', '323', '487', '472', '424', '437', '466', '635', '428'], dtype=object)
sat_data['SAT Math Avg. Score'].unique()
array(['404', '423', '402', '401', '433', '557', '574', '418', '604', '400', '393', '384', '375', '438', '449', '358', '388', '392', '390', '370', '391', '483', 's', '512', '493', '465', '490', '563', '590', '533', '492', '594', '553', '399', '426', '357', '473', '365', '416', '460', '387', '735', '489', '349', '472', '440', '425', '378', '395', '371', '581', '436', '508', '337', '517', '403', '379', '382', '441', '424', '368', '575', '564', '398', '369', '506', '514', '421', '446', '351', '318', '366', '353', '360', '474', '361', '654', '376', '380', '422', '456', '364', '324', '359', '394', '396', '356', '411', '381', '386', '385', '373', '355', '464', '367', '480', '363', '339', '312', '419', '315', '455', '412', '406', '333', '408', '350', '420', '688', '362', '435', '648', '471', '397', '372', '344', '432', '323', '346', '374', '417', '659', '383', '443', '499', '584', '410', '377', '335', '415', '341', '342', '462', '481', '413', '320', '322', '478', '486', '477', '427', '437', '496', '468', '414', '519', '475', '561', '409', '338', '454', '429', '568', '447', '498', '463', '451', '651', '458', '545', '539', '523', '434', '537', '491', '445', '452', '497', '660', '488', '682', '317', '444'], dtype=object)
sat_data['SAT Writing Avg. Score'].unique()
array(['363', '366', '370', '359', '384', '316', '525', '411', '628', '387', '392', '378', '362', '432', '416', '340', '385', '405', '390', '394', '479', 's', '357', '461', '467', '523', '518', '550', '515', '459', '592', '533', '381', '428', '391', '349', '458', '368', '388', '360', '682', '442', '351', '475', '393', '400', '420', '344', '382', '335', '431', '352', '477', '512', '345', '343', '403', '429', '426', '376', '430', '408', '374', '358', '577', '402', '361', '373', '455', '395', '448', '333', '398', '342', '364', '404', '355', '588', '367', '399', '346', '423', '326', '348', '311', '371', '383', '407', '389', '375', '350', '413', '365', '356', '377', '415', '457', '297', '339', '379', '369', '440', '302', '301', '330', '372', '410', '427', '649', '414', '354', '317', '636', '433', '353', '419', '341', '298', '312', '418', '587', '570', '396', '380', '300', '425', '334', '291', '441', '386', '286', '424', '314', '435', '476', '439', '542', '331', '332', '421', '329', '406', '443', '318', '481', '638', '489', '464', '401', '496', '454', '502', '596', '412', '494', '491', '544', '450', '466', '417', '470', '397', '422'], dtype=object)
#prints out the rows of data that have an s entry in the SAT Crit Reading column sat_data[sat_data['SAT Critical Reading Avg. Score'] == "s"]
DBN SCHOOL NAME Num of SAT Test Takers SAT Critical Reading Avg. Score SAT Math Avg. Score SAT Writing Avg. Score
22 02M392 MANHATTAN BUSINESS ACADEMY s s s s
23 02M393 BUSINESS OF SPORTS SCHOOL s s s s
25 02M399 THE HIGH SCHOOL FOR LANGUAGE AND DIPLOMACY s s s s
38 02M427 MANHATTAN ACADEMY FOR ARTS & LANGUAGE s s s s
40 02M437 HUDSON HIGH SCHOOL OF LEARNING TECHNOLOGIES s s s s
41 02M438 INTERNATIONAL HIGH SCHOOL AT UNION SQUARE s s s s
47 02M473 WASHINGTON IRVING YABC s s s s
77 03M402 THE URBAN ASSEMBLY SCHOOL FOR GREEN CAREERS s s s s
78 03M403 THE GLOBAL LEARNING COLLABORATIVE s s s s
81 03M417 FRANK MCCOURT HIGH SCHOOL s s s s
89 03M577 YOUNG ADULT BOROUGH CENTER AT LOUIS D. BRANDEI... s s s s
100 05M362 COLUMBIA SECONDARY SCHOOL FOR MATH, SCIENCE, A... s s s s
111 06M423 HIGH SCHOOL FOR EXCELLENCE AND INNOVATION s s s s
122 07X379 JILL CHAIFETZ TRANSFER HIGH SCHOOL s s s s
123 07X381 BRONX HAVEN HIGH SCHOOL s s s s
136 08X269 BRONX STUDIO SCHOOL FOR WRITERS AND ARTISTS s s s s
143 08X367 ARCHIMEDES ACADEMY FOR MATH, SCIENCE AND TECHN... s s s s
144 08X376 ANTONIA PANTOJA PREPARATORY ACADEMY, A COLLEGE... s s s s
148 08X432 BRONX BRIDGES HIGH SCHOOL s s s s
150 08X507 STEVENSON YABC s s s s
153 08X537 BRONX ARENA HIGH SCHOOL s s s s
167 09X324 BRONX EARLY COLLEGE ACADEMY FOR TEACHING & LEA... s s s s
186 10X319 PROVIDING URBAN LEARNERS SUCCESS IN EDUCATION ... s s s s
201 10X478 LEARNING TO WORK GED AT JOHN F. KENNEDY s s s s
216 11X417 YOUNG ADULT BOROUGH CNTR CHRISTOPHER COLUMBUS HS s s s s
219 11X456 LEARNING TO WORK YABC AT TRUMAN HIGH SCHOOL s s s s
234 12X428 LEARNING TO WORK YABC AT MONROE ACADEMY s s s s
236 12X478 THE CINEMA SCHOOL s s s s
237 12X479 BRONX CAREER AND COLLEGE PREPARATORY HIGH SCHOOL s s s s
254 13K527 URBAN ASSEMBLY INSTITUTE OF MATH AND SCIENCE F... s s s s
259 13K616 BROOKLYN HIGH SCHOOL FOR LEADERSHIP AND COMMUN... s s s s
263 14K322 FOUNDATIONS ACADEMY s s s s
277 14K923 AUTOMOTIVE HIGH SCHOOL YABC s s s s
286 15K529 WEST BROOKLYN COMMUNITY HIGH SCHOOL s s s s
289 15K667 SUNSET PARK HIGH SCHOOL s s s s
295 16K688 THE BROOKLYN ACADEMY OF GLOBAL FINANCE s s s s
298 17K467 ERASMUS YABC s s s s
315 17K751 ACADEMY FOR HEALTH CAREERS s s s s
331 19K420 FRANKLIN K. LANE HIGH SCHOOL s s s s
332 19K431 THOMAS JEFFERSON YABC s s s s
348 20K609 THE URBAN ASSEMBLY SCHOOL FOR CRIMINAL JUSTICE s s s s
354 21K412/21K411 ABRAHAM LINCOLN YABC/LEARNING TO WORK GED AT A... s s s s
355 21K468 KINGSBOROUGH EARLY COLLEGE SCHOOL s s s s
380 24Q457 YOUNG ADULT BOROUGH CENTER AT ARTS AND BUSINES... s s s s
388 24Q744 VOYAGES PREPARATORY s s s s
416 28Q284 YORK EARLY COLLEGE ACADEMY s s s s
418 28Q325 HILLSIDE ARTS & LETTERS ACADEMY s s s s
426 28Q686 QUEENS METROPOLITAN HIGH SCHOOL s s s s
435 29Q326 CAMBRIA HEIGHTS ACADEMY s s s s
440 30Q286 YOUNG WOMEN'S LEADERSHIP SCHOOL, ASTORIA s s s s
468 75K371 P.S. 371 s LILLIAN L. RASHKIS s s s s
469 75M035 P.S. 035 s s s s
470 75Q256 P.S. Q256 s s s s
472 75R025 SOUTH RICHMOND HIGH SCHOOL I.S./P.S. 25 s s s s
473 75X012 P.S. X012 LEWIS AND CLARK SCHOOL s s s s
474 75X754 J. M. RAPPORT SCHOOL CAREER DEVELOPMENT s s s s
475 79M645 SCHOOL FOR COOPERATIVE TECHNICAL EDUCATION s s s s
#Approach 1: Creating a function then using apply
def replace_s(row, placeholder="s", fill_value=300):
    """Return ``fill_value`` for a placeholder entry, otherwise the entry itself.

    Despite its name, ``row`` is a single cell value: the function is passed
    to ``Series.apply``, which calls it once per element of the column.

    Args:
        row: The cell value to inspect.
        placeholder: Marker that stands in for a missing score. Defaults to
            ``"s"``, the marker that shows up in the score columns.
        fill_value: Value substituted for the marker. Defaults to ``300``.

    Returns:
        ``fill_value`` when ``row == placeholder``; otherwise ``row`` unchanged
        (no type conversion is performed, so numeric strings stay strings).
    """
    if row == placeholder:
        return fill_value
    return row
# Overwrite each score column in place with its cleaned values.
for score_col in (
    'SAT Writing Avg. Score',
    'SAT Math Avg. Score',
    'SAT Critical Reading Avg. Score',
):
    sat_data[score_col] = sat_data[score_col].apply(replace_s)
#Check to see sat_data['SAT Writing Avg. Score'].dtypes
dtype('O')
#Conditionally checks for a specific entry in a specific column sat_data[sat_data['SAT Critical Reading Avg. Score'] == "s"]
DBN SCHOOL NAME Num of SAT Test Takers SAT Critical Reading Avg. Score SAT Math Avg. Score SAT Writing Avg. Score
def replace_with_zero(row, placeholder="s"):
    """Return ``0`` for a placeholder entry, otherwise the entry itself.

    Despite its name, ``row`` is a single cell value: the function is passed
    to ``Series.apply``, which calls it once per element of the column.

    Args:
        row: The cell value to inspect.
        placeholder: Marker that stands in for a missing count. Defaults to
            ``"s"``, the marker that shows up in the test-taker column.

    Returns:
        ``0`` when ``row == placeholder``; otherwise ``row`` unchanged (no
        type conversion is performed, so numeric strings stay strings).
    """
    if row == placeholder:
        return 0
    return row
sat_data["Num of SAT Test Takers"] = sat_data["Num of SAT Test Takers"].apply(replace_with_zero)
#Conditionally checks for a specific entry in a specific column sat_data[sat_data['Num of SAT Test Takers'] == "s"]
DBN SCHOOL NAME Num of SAT Test Takers SAT Critical Reading Avg. Score SAT Math Avg. Score SAT Writing Avg. Score
#only want numeric data just_the_scores = sat_data[['Num of SAT Test Takers','SAT Critical Reading Avg. Score', 'SAT Math Avg. Score', 'SAT Writing Avg. Score']]
just_the_scores.head()
Num of SAT Test Takers SAT Critical Reading Avg. Score SAT Math Avg. Score SAT Writing Avg. Score
0 29 355 404 363
1 91 383 423 366
2 70 377 402 370
3 7 414 401 359
4 44 390 433 384
just_the_scores.dtypes
Num of SAT Test Takers object SAT Critical Reading Avg. Score object SAT Math Avg. Score object SAT Writing Avg. Score object dtype: object
just_the_scores["Num of SAT Test Takers"].unique()
array(['29', '91', '70', '7', '44', '112', '159', '18', '130', '16', '62', '53', '58', '85', '48', '76', '50', '40', '69', '42', '60', '92', 0, '79', '263', '54', '94', '104', '114', '66', '103', '127', '144', '336', '84', '95', '59', '72', '49', '151', '832', '167', '25', '81', '264', '131', '73', '14', '78', '26', '77', '56', '30', '33', '121', '9', '335', '36', '83', '154', '191', '270', '61', '27', '41', '12', '32', '261', '531', '75', '35', '111', '43', '375', '51', '31', '20', '214', '101', '55', '63', '24', '228', '65', '34', '64', '28', '47', '52', '67', '39', '415', '6', '68', '80', '74', '38', '113', '86', '57', '443', '731', '109', '99', '10', '46', '97', '189', '37', '1277', '90', '105', '8', '13', '89', '185', '102', '134', '142', '141', '71', '165', '259', '17', '182', '456', '238', '694', '385', '475', '727', '448', '119', '824', '518', '236', '11', '155', '320', '241', '138', '396', '45', '558', '347', '278', '888', '934', '334', '708', '175', '87', '93', '404', '403', '194', '762', '462', '422', '98', '395', '392', '174', '148', '143', '135', '137', '107', '391', '271', '807', '535', '227', '88', '23'], dtype=object)
just_the_scores["Num of SAT Test Takers"].astype(int)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-34-eef83cd6674b> in <module> ----> 1 just_the_scores["Num of SAT Test Takers"].astype(int) /usr/local/lib/python3.8/dist-packages/pandas/core/generic.py in astype(self, dtype, copy, errors) 5541 else: 5542 # else, only a single dtype is given -> 5543 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors,) 5544 return self._constructor(new_data).__finalize__(self, method="astype") 5545 /usr/local/lib/python3.8/dist-packages/pandas/core/internals/managers.py in astype(self, dtype, copy, errors) 593 self, dtype, copy: bool = False, errors: str = "raise" 594 ) -> "BlockManager": --> 595 return self.apply("astype", dtype=dtype, copy=copy, errors=errors) 596 597 def convert( /usr/local/lib/python3.8/dist-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, **kwargs) 404 applied = b.apply(f, **kwargs) 405 else: --> 406 applied = getattr(b, f)(**kwargs) 407 result_blocks = _extend_blocks(applied, result_blocks) 408 /usr/local/lib/python3.8/dist-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors) 592 vals1d = values.ravel() 593 try: --> 594 values = astype_nansafe(vals1d, dtype, copy=True) 595 except (ValueError, TypeError): 596 # e.g. astype_nansafe can fail on object-dtype of strings /usr/local/lib/python3.8/dist-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna) 965 # work around NumPy brokenness, #1987 966 if np.issubdtype(dtype.type, np.integer): --> 967 return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape) 968 969 # if we have a datetime/timedelta array of objects pandas/_libs/lib.pyx in pandas._libs.lib.astype_intsafe() ValueError: invalid literal for int() with base 10: 's'
#Fit the data
# KMeans needs a purely numeric matrix, but the score columns are object
# dtype (strings), so convert them explicitly before fitting. Any entry that
# is still not a number (for example a leftover "s" marker) becomes NaN and
# its row is dropped, instead of crashing the fit with
# "could not convert string to float".
numeric_scores = just_the_scores.apply(pd.to_numeric, errors="coerce").dropna()
# NOTE(review): if rows were dropped, cluster labels line up with
# numeric_scores.index rather than with every row of sat_data.
X_train = numeric_scores.to_numpy(dtype=float)

kmeans = KMeans(n_clusters=5, random_state=100)
kmeans = kmeans.fit(X_train)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-26-d4efc6e7ac9c> in <module> 1 kmeans = KMeans(n_clusters = 5, random_state = 100) ----> 2 kmeans = kmeans.fit(X_train) /usr/local/lib/python3.8/dist-packages/sklearn/cluster/_kmeans.py in fit(self, X, y, sample_weight) 1028 Fitted estimator. 1029 """ -> 1030 X = self._validate_data(X, accept_sparse='csr', 1031 dtype=[np.float64, np.float32], 1032 order='C', copy=self.copy_x, /usr/local/lib/python3.8/dist-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params) 418 f"requires y to be passed, but the target y is None." 419 ) --> 420 X = check_array(X, **check_params) 421 out = X 422 else: /usr/local/lib/python3.8/dist-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs) 70 FutureWarning) 71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)}) ---> 72 return f(**kwargs) 73 return inner_f 74 /usr/local/lib/python3.8/dist-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator) 596 array = array.astype(dtype, casting="unsafe", copy=False) 597 else: --> 598 array = np.asarray(array, order=order, dtype=dtype) 599 except ComplexWarning: 600 raise ValueError("Complex data not supported\n" /usr/local/lib/python3.8/dist-packages/numpy/core/_asarray.py in asarray(a, dtype, order) 83 84 """ ---> 85 return array(a, dtype, copy=False, order=order) 86 87 ValueError: could not convert string to float: 's'