mirror of
https://github.com/josephmisiti/awesome-machine-learning.git
synced 2025-06-02 18:42:29 -04:00
Added script to scrape packages from r-project.org
This commit is contained in:
parent
5a7f7678ca
commit
704b1c5629
1 changed files with 23 additions and 0 deletions
23
scripts/pull_R_packages.py
Executable file
23
scripts/pull_R_packages.py
Executable file
|
@ -0,0 +1,23 @@
|
|||
#!/usr/bin/python
|
||||
|
||||
"""
|
||||
This script scrapes the r-project.org (CRAN) Machine Learning task view and formats the packages
|
||||
in github markdown style for this awesome-machine-learning repo.
|
||||
"""
|
||||
|
||||
from pyquery import PyQuery as pq
|
||||
import urllib
|
||||
|
||||
|
||||
# Scrape the CRAN Machine Learning task view and print one GitHub-markdown
# bullet ("* [name](link) - description") per package linked from it.

# Resolve urlopen on both Python 3 (urllib.request) and Python 2 (urllib),
# so the script no longer depends on the Python 2-only urllib.urlopen.
try:
    from urllib.request import urlopen
except ImportError:
    from urllib import urlopen
from contextlib import closing

CRAN_WEB_ROOT = 'http://cran.r-project.org/web'
ML_VIEW_URL = CRAN_WEB_ROOT + '/views/MachineLearning.html'


def _fetch(url, **kw):
    """Return the raw body of *url*; used as the ``opener`` for PyQuery.

    Extra keyword arguments are accepted and ignored, matching the opener
    lambdas this helper replaces. The response is closed once it is read.
    """
    with closing(urlopen(url)) as response:
        return response.read()


d = pq(url=ML_VIEW_URL, opener=_fetch)
for e in d("li").items():
    anchors = e("a")
    if not anchors:
        # A list item without any link cannot describe a package.
        continue
    package_name = anchors.html()
    package_link = anchors[0].attrib.get('href', '')
    # NOTE(review): only links containing '..' (relative to the task view
    # page) are treated as package pages; confirm this matches the intended
    # nesting of the original script.
    if '..' not in package_link:
        continue
    # Rewrite only the leading '..' so deeper relative links are not mangled.
    package_link = package_link.replace("..", CRAN_WEB_ROOT, 1)
    dd = pq(url=package_link, opener=_fetch)
    # The first <h2> of the linked page is used as the package description.
    package_description = dd("h2").html()
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("* [%s](%s) - %s" % (package_name, package_link, package_description))
|
Loading…
Add table
Add a link
Reference in a new issue