A GitHub repository recommender. Name still under construction. I am Jamis.
jamis.johnson@columbia.edu | github.com/jamiis | @jamisjohnson
#!/usr/bin/env python
from getpass import getpass
from time import sleep

import github3
from github3.exceptions import GitHubError

# authenticate with github
uname = raw_input("Github username[jamiis]:") or "jamiis"
pwd = getpass()
gh = github3.login(uname, pwd)

def sleep_on_rate_limit(err):
    '''sleep until github api requests reset'''
    # note: dumb way to check error type but there is no rate-limit-specific err
    if err.message == "API rate limit exceeded for %s." % uname:
        # poll the rate limit every 5 minutes
        retry_in = 60 * 5
        while True:
            if gh.rate_limit()['rate']['remaining']:
                return
            print 'continue in', retry_in / 60.0, 'minutes'
            sleep(retry_in)

def prevail(gen):
    '''
    forces generator to continue even on error. very naive! don't use
    unless you really want to brute-force the generator to continue.
    '''
    while True:
        try:
            yield next(gen)
        except StopIteration:
            raise
        # catches all github3 exceptions
        except GitHubError as e:
            print e.__class__.__name__, e
            sleep_on_rate_limit(e)

with open('data/stargazers', 'a') as f:
    # all_users pages through every github account; `since` is the ID of
    # the last user seen, so a crashed crawl can pick up where it left off
    since = raw_input("Continue from user ID[None]:") or None
    for user in prevail(gh.all_users(since=since)):
        print '%s\t%s' % (user.id, user.login)
        for repo in prevail(user.starred_repositories()):
            f.write("%s::%s::%s::%s\n" % (
                user.login, user.id, repo.full_name, repo.id))
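When the crawl dies anyway, rerun the script and hand it the last user ID it printed. A hypothetical session (scrape.py is an assumed filename; the prompts come straight from the code above):

$ ./scrape.py
Github username[jamiis]:
Password:
Continue from user ID[None]: 126330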
$ tail -f data/stargazers
rlenglet::126329::openstack/neutron::2400289
rlenglet::126329::rlenglet/openfaucet::1270260
rlenglet::126329::ubf/ubf-abnf::309879
rlenglet::126329::rlenglet/logtilla::305584
rlenglet::126329::ubf/ubf::173366
iotae::126330::iotae/Rails-Test::305579
iotae::126330::jrallison/authlogic_oauth::214049
iotae::126330::kalasjocke/authlogic-facebook-connect::190556
iotae::126330::binarylogic/authlogic_openid::156775
iotae::126330::insoshi/insoshi::6313

Approximately 4% of GitHub's entire user population
If I got hit by a bus, it would be very hard to reverse engineer the non-Spark-SQL code: long chains of RDD transformations don't announce their intent the way a SQL query does.
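For contrast, a minimal sketch of the "most popular repos" step from the script below, expressed in Spark SQL. It assumes the Spark 1.2 SQLContext API; the stars table name and the app name are illustrative, not part of the actual pipeline.

#!/usr/bin/env python
# sketch only: the "count stars per repo" step as a SQL query
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row

sc = SparkContext(appName='RepoRecommenderSQL')
sqlCtx = SQLContext(sc)

# one Row per "login::user_id::repo_name::repo_id" scraper line
rows = sc.textFile('data/stargazers')\
    .map(lambda l: l.split('::'))\
    .map(lambda f: Row(user=f[1], repo=f[3]))
sqlCtx.inferSchema(rows).registerTempTable('stars')

# the intent is legible at a glance: stars per repo, most-starred first
top_repos = sqlCtx.sql(
    "SELECT repo, COUNT(*) AS cnt FROM stars GROUP BY repo ORDER BY cnt DESC")
print top_repos.take(5)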
#!/usr/bin/env python
# https://spark.apache.org/docs/1.2.0/mllib-collaborative-filtering.html
from pyspark.mllib.recommendation import ALS
from numpy import array
from pyspark import SparkContext
from pprint import pprint
import os, sys

def extract_user_repo(line, fieldtype=float):
    '''parse one "login::user_id::repo_name::repo_id" scraper line'''
    line = line.split('::')
    if fieldtype is float:
        # user.id, repo.id
        fields = [line[1], line[3]]
    elif fieldtype in [unicode, str]:
        # user.login, repo.full_name
        fields = [line[0], line[2]]
    return array([fieldtype(f) for f in fields])

if __name__ == "__main__":
    sc = SparkContext(appName='RepoRecommender')

    env = os.getenv('ENV', 'dev')
    if env == 'dev':
        lines = sys.argv[1] if len(sys.argv) >= 2 else '100'
        text = 'data/stargazers.%sk' % lines
    elif env == 'prod':
        text = 's3n://jamis.bucket/stargazers'

    # load and parse the text file
    data = sc.textFile(text)
    starpairs = data.map(extract_user_repo)
    starpairs.cache()

    users = starpairs.map(lambda t: t[0]).distinct()

    # keep only the 1% most-starred repos
    repos = starpairs.map(lambda t: t[1]).distinct()
    sample = int(0.01 * repos.count())
    top_repos = starpairs\
        .groupBy(lambda t: t[1])\
        .sortBy(lambda t: len(t[1]), False)\
        .map(lambda t: t[0])\
        .take(sample)
    top_repos_rdd = sc.parallelize(top_repos)
    top_repos_rdd.cache()
    # broadcast so every task shares one read-only copy of the list
    top_repos_bc = sc.broadcast(top_repos)
    pprint(top_repos[:5])

    starpairs_filtered = starpairs.filter(lambda t: t[1] in top_repos_bc.value)
    starpairs_filtered.cache()

    # train recommendation model using alternating least squares;
    # every star is an implicit rating of 1
    stars_with_rating = starpairs_filtered.map(lambda t: array([t[0], t[1], 1]))
    model = ALS.trainImplicit(stars_with_rating, rank=1)

    # get all user->repo pairs without stars
    users_repos = users.cartesian(top_repos_rdd).groupByKey()
    stars_grouped = starpairs_filtered.groupByKey()
    unstarred = users_repos.join(stars_grouped)\
        .map(lambda i: (i[0], set(i[1][0]) - set(i[1][1])))\
        .flatMap(lambda i: [(i[0], repo) for repo in i[1]])

    # predict ratings for the unstarred user-repo pairs
    predictions = model.predictAll(unstarred)

    # for each user, keep the 5 repos with the highest predicted rating
    top = predictions\
        .map(lambda t: (t[0], (t[1], t[2])))\
        .groupByKey()\
        .map(lambda t: (t[0], [i[0] for i in sorted(t[1], key=lambda i: -i[1])[:5]]))\
        .coalesce(1)

    if env == 'dev':
        top.saveAsTextFile('data/recommendations.%sk' % lines)
    elif env == 'prod':
        top.saveAsTextFile('s3n://jamis.bucket/recommendations')
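A sketch of how the job might be submitted (recommend.py is an assumed filename; ENV is the environment variable the script reads to choose its input and output paths):

$ ENV=dev spark-submit recommend.py 100   # reads data/stargazers.100k
$ ENV=prod spark-submit recommend.py      # reads s3n://jamis.bucket/stargazers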