added script to scrape packages from r-project.org

2025-06-02 18:42:29 -04:00 · 2014-07-17 12:56:06 -04:00 · 2014-07-17 12:56:06 -04:00 · 704b1c5629
commit 704b1c5629
parent 5a7f7678ca
1 changed files with 23 additions and 0 deletions
--- a/scripts/pull_R_packages.py
+++ b/scripts/pull_R_packages.py
@@ -0,0 +1,23 @@
+#!/usr/bin/python
+
+"""
+	This script scrapes the Machine Learning task view on cran.r-project.org and prints each
+	linked package as a GitHub Markdown list item for this awesome-machine-learning repo.
+"""
+
+from pyquery import PyQuery as pq
+import urllib
+
+
+def fetch(url, **kw):  # opener for pq(): download the page body; shared by the index and each package page
+	return urllib.urlopen(url).read()
+d = pq(url='http://cran.r-project.org/web/views/MachineLearning.html',opener=fetch)
+for e in d("li").items():
+	anchors = e("a")
+	if not anchors:  # an <li> without a link has nothing to list (anchors[0] would raise IndexError)
+		continue
+	package_name, package_link = anchors.html(), anchors[0].attrib.get('href', '')
+	if package_link.startswith('../'):  # only parent-relative links are treated as package pages
+		package_link = package_link.replace("..",'http://cran.r-project.org/web',1)  # make the link absolute
+		package_description = pq(url=package_link,opener=fetch)("h2").html()  # the package page's <h2> is used as the description
+		print "* [%s](%s) - %s" % (package_name,package_link,package_description)