2014-07-17 12:56:06 -04:00
|
|
|
#!/usr/bin/python
|
|
|
|
|
|
|
|
"""
|
2017-08-03 02:36:23 -04:00
|
|
|
This script scrapes the r-project.org machine learning package selection and
formats the packages in GitHub Markdown style for the
awesome-machine-learning repo. The result is written to Packages.txt.
|
2014-07-17 12:56:06 -04:00
|
|
|
"""
|
|
|
|
|
|
|
|
from pyquery import PyQuery as pq
|
|
|
|
import urllib
|
2015-02-07 15:04:10 -05:00
|
|
|
import codecs
|
2014-07-17 12:56:06 -04:00
|
|
|
|
2017-08-03 02:36:23 -04:00
|
|
|
# Scrape every package linked from the CRAN Machine Learning page and write
# one markdown entry per package to Packages.txt.

# urllib.urlopen only exists on Python 2; Python 3 moved it to
# urllib.request, so pick whichever one this interpreter provides.
try:
    from urllib.request import urlopen
except ImportError:
    from urllib import urlopen

CRAN_WEB_ROOT = 'http://cran.r-project.org/web'


def _fetch(url, **kw):
    """Return the raw body of *url*; used as the pyquery ``opener``.

    Extra keyword arguments supplied by pyquery are accepted and ignored.
    """
    response = urlopen(url)
    try:
        return response.read()
    finally:
        # Release the connection instead of waiting for garbage collection.
        response.close()


d = pq(url=CRAN_WEB_ROOT + '/views/MachineLearning.html', opener=_fetch)

# The context manager guarantees the output is flushed and closed even when
# a request fails part-way through the listing.
with codecs.open("Packages.txt", encoding='utf-8', mode="w") as text_file:
    for e in d("li").items():
        anchors = e("a")
        if not anchors:
            # A list item without a link has no package page to describe.
            continue
        package_name = anchors.html()
        package_link = anchors[0].attrib.get('href')
        if package_link is None:
            # An anchor without an href cannot be followed.
            continue
        if '..' in package_link:
            # Links on the index page are relative ("../packages/..."), so
            # anchor them at the CRAN web root to get a fetchable URL.
            package_link = package_link.replace("..", CRAN_WEB_ROOT)
        dd = pq(url=package_link, opener=_fetch)
        # The first <h2> of the package page is used as the description.
        package_description = dd("h2").html()
        text_file.write(" [%s](%s) - %s \n" % (package_name, package_link,
                                               package_description))
|