Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupport News AboutSign UpSign In
| Download
Views: 3191
1
# GitHub data scraper
2
3
"""
4
variables of interest:
5
indp. variables
6
- language, given as a binary variable. Need 4 positions for 5 languages
7
- #number of days created ago, 1 position
8
- has wiki? Boolean, 1 position
9
- followers, 1 position
10
- following, 1 position
11
- constant
12
13
dep. variables
14
-stars/watchers
15
-forks
16
17
"""
18
from json import loads
19
import datetime
20
import numpy as np
21
from requests import get
22
23
24
# --- sampling configuration ---------------------------------------------
N = 120  # sample size: number of repositories (rows) to collect
MAX = 8 * 10 ** 6  # exclusive upper bound for the random ``since`` value
auth = ("username", "password")  # credentials sent with every API request

# Shorthand for the random integer generator used to pick ``since``.
randint = np.random.randint

# Reference date used to turn ``created_at`` into an age in days.
today = datetime.datetime.today()

# Tracked languages and the column each one occupies in the data matrix.
language_mappings = {
    name: column
    for column, name in enumerate(
        ("Python", "JavaScript", "Ruby", "Java", "Shell", "PHP")
    )
}

# Data matrix: one row per sampled repository; one indicator column per
# tracked language followed by six further integer columns.
X = np.zeros(shape=(N, len(language_mappings) + 6), dtype=int)
34
35
def _fetch_json_or_none(url):
    """Return the decoded JSON body of ``url``, or None on an HTTP error.

    Used for per-repository and per-owner lookups: a record that cannot be
    fetched only disqualifies the current candidate, so the caller simply
    draws another sample instead of aborting the run.
    """
    response = get(url, auth=auth)
    if not response.ok:
        return None
    return loads(response.text)


def _sample_repository():
    """Draw random repositories until one qualifies for the sample.

    A candidate qualifies when it is not a fork, its language is a key of
    ``language_mappings``, and both its detail record and its owner record
    can be fetched.

    Returns:
        A ``(listing, details, owner)`` tuple of decoded JSON objects: the
        entry from the public repository listing, the repository's detail
        record, and the owner's user record.

    Raises:
        requests.HTTPError: if the repository listing itself answers with
            an error status (for example bad credentials or an exhausted
            rate limit), so the script stops instead of retrying forever.
    """
    while True:
        response = get(
            "https://api.github.com/repositories",
            params={"since": randint(0, MAX)},
            auth=auth,
        )
        response.raise_for_status()
        page = loads(response.text)
        if not page:
            # Nothing listed after this ``since`` value; draw again.
            continue

        # Only the first repository of the page is considered.
        listing = page[0]
        if listing["fork"]:
            # Reject forks before spending another API call on details.
            continue

        details = _fetch_json_or_none(listing["url"])
        if details is None or details.get("language") not in language_mappings:
            continue

        owner = _fetch_json_or_none(listing["owner"]["url"])
        if owner is None:
            continue

        return listing, details, owner


# Fill one row of X per sampled repository.
for i in range(N):
    results, repo_results, user_results = _sample_repository()

    # languages: set the indicator column of the repository's language
    X[i, language_mappings[repo_results["language"]]] = 1

    # delta time: age in whole days, from the YYYY-MM-DD prefix of created_at
    created = datetime.datetime.strptime(repo_results["created_at"][:10], "%Y-%m-%d")
    X[i, 6] = (today - created).days

    # haswiki
    X[i, 7] = repo_results["has_wiki"]

    # user information
    X[i, 8] = user_results["following"]
    X[i, 9] = user_results["followers"]

    # dependent data
    X[i, 10] = repo_results["watchers_count"]
    X[i, 11] = repo_results["forks_count"]

    # Single-argument print() calls behave the same on Python 2 and 3.
    print("")
    print(" -------------- ")
    print("%s :  %s %s %s %s" % (
        i,
        results["full_name"],
        repo_results["language"],
        repo_results["watchers_count"],
        repo_results["forks_count"],
    ))
    print(" -------------- ")
    print("")
82
83
# Write the collected matrix as comma-separated integers. The output
# directory is created first so that a missing "data/" folder cannot
# discard the whole scrape at the very last step.
import os

_output_path = "data/github_data.csv"
_output_dir = os.path.dirname(_output_path)
# isdir/makedirs rather than exist_ok=True, which Python 2 does not support.
if not os.path.isdir(_output_dir):
    os.makedirs(_output_dir)
np.savetxt(_output_path, X, delimiter=",", fmt="%d")
84
85