rust/src/main.rs

391 lines
16 KiB
Rust
Raw Normal View History

2020-01-10 23:07:36 +00:00
use pulldown_cmark::{Parser, Event, Tag};
use std::fs;
2020-01-13 22:13:00 +00:00
use futures::future::{select_all, BoxFuture, FutureExt};
2020-01-10 23:07:36 +00:00
use std::collections::{BTreeSet, BTreeMap};
use serde::{Serialize, Deserialize};
use lazy_static::lazy_static;
use std::time;
use log::{warn, debug, info};
use std::io::Write;
use reqwest::{Client, redirect::Policy, StatusCode, header, Url};
2020-01-13 22:13:00 +00:00
use regex::Regex;
use failure::{Fail, Error, format_err};
use chrono::{Local, DateTime, Duration};
use std::env;
use tokio::sync::Semaphore;
use tokio::sync::SemaphorePermit;
#[derive(Debug, Fail, Serialize, Deserialize)]
enum CheckerError {
#[fail(display = "failed to try url")]
NotTried, // Generally shouldn't happen, but useful to have
#[fail(display = "http error: {}", status)]
HttpError {
status: u16,
location: Option<String>,
},
#[fail(display = "reqwest error: {}", error)]
ReqwestError {
error: String,
2020-01-20 17:53:24 +00:00
},
#[fail(display = "travis build is unknown")]
TravisBuildUnknown,
2020-02-12 23:38:26 +00:00
#[fail(display = "travis build image with no branch")]
TravisBuildNoBranch,
#[fail(display = "github actions image with no branch")]
GithubActionNoBranch,
}
/// Renders a one-line, human-readable description of `err` for `url`.
///
/// HTTP and badge errors are shown as `[reason] url` (with ` -> target`
/// appended for redirects that reported a location). The remaining variants
/// fall back to their `Debug` representation, which does not include the url.
fn formatter(err: &CheckerError, url: &String) -> String {
    match err {
        CheckerError::HttpError { status, location: Some(loc) } => {
            format!("[{}] {} -> {}", status, url, loc)
        }
        CheckerError::HttpError { status, location: None } => format!("[{}] {}", status, url),
        CheckerError::TravisBuildUnknown => format!("[Unknown travis build] {}", url),
        CheckerError::TravisBuildNoBranch => {
            format!("[Travis build image with no branch specified] {}", url)
        }
        CheckerError::GithubActionNoBranch => {
            format!("[Github action image with no branch specified] {}", url)
        }
        CheckerError::NotTried | CheckerError::ReqwestError { .. } => format!("{:?}", err),
    }
}
/// Caps how many `Handle`s can be alive at once, backed by a tokio semaphore.
struct MaxHandles {
    /// One permit per handle that may still be given out.
    remaining: Semaphore,
}
/// Guard returned by `MaxHandles::get`; it owns one semaphore permit for as
/// long as it is alive.
struct Handle<'a> {
    /// Never read; held only so the permit is released when the guard drops.
    _permit: SemaphorePermit<'a>,
}
impl MaxHandles {
fn new(max: usize) -> MaxHandles {
MaxHandles { remaining: Semaphore::new(max) }
}
async fn get<'a>(&'a self) -> Handle<'a> {
2021-05-05 21:21:23 +00:00
let permit = self.remaining.acquire().await.unwrap();
return Handle { _permit: permit };
}
}
impl Drop for Handle<'_> {
    /// Logs at debug level when a handle is released; the permit field is
    /// dropped right after this runs.
    fn drop(&mut self) {
        debug!("Dropping");
    }
}
lazy_static! {
    /// Shared HTTP client used for every link check.
    static ref CLIENT: Client = Client::builder()
        // because some certs are out of date
        .danger_accept_invalid_certs(true)
        // so some sites (e.g. sciter.com) don't reject us
        .user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:68.0) Gecko/20100101 Firefox/68.0")
        // redirects are not followed; the checker inspects 3xx responses itself
        .redirect(Policy::none())
        .pool_max_idle_per_host(0)
        .timeout(time::Duration::from_secs(20))
        .build()
        .unwrap();

    /// This is to avoid errors with running out of file handles, so we only
    /// do 20 requests at a time.
    static ref HANDLES: MaxHandles = MaxHandles::new(20);
}
2020-01-10 23:07:36 +00:00
fn get_url(url: String) -> BoxFuture<'static, (String, Result<(), CheckerError>)> {
debug!("Need handle for {}", url);
2020-01-13 22:13:00 +00:00
async move {
let _handle = HANDLES.get().await;
return get_url_core(url).await;
}.boxed()
}
fn get_url_core(url: String) -> BoxFuture<'static, (String, Result<(), CheckerError>)> {
async move {
let mut res = Err(CheckerError::NotTried);
2020-01-13 22:13:00 +00:00
for _ in 0..5u8 {
debug!("Running {}", url);
lazy_static! {
static ref GITHUB_REPO_REGEX: Regex = Regex::new(r"^https://github.com/(?P<org>[^/]+)/(?P<repo>[^/]+)$").unwrap();
static ref GITHUB_API_REGEX: Regex = Regex::new(r"https://api.github.com/").unwrap();
}
if env::var("GITHUB_USERNAME").is_ok() && env::var("GITHUB_TOKEN").is_ok() && GITHUB_REPO_REGEX.is_match(&url) {
let rewritten = GITHUB_REPO_REGEX.replace_all(&url, "https://api.github.com/repos/$org/$repo");
info!("Replacing {} with {} to workaround rate limits on Github", url, rewritten);
let (_new_url, res) = get_url_core(rewritten.to_string()).await;
return (url, res);
}
let mut req = CLIENT
2020-01-13 22:13:00 +00:00
.get(&url)
.header(header::ACCEPT, "image/svg+xml, text/html, */*;q=0.8");
if GITHUB_API_REGEX.is_match(&url) {
if let Ok(username) = env::var("GITHUB_USERNAME") {
if let Ok(password) = env::var("GITHUB_TOKEN") {
// needs a token with at least public_repo scope
info!("Using basic auth for {}", url);
req = req.basic_auth(username, Some(password));
}
}
}
let resp = req.send().await;
2020-01-13 22:13:00 +00:00
match resp {
Err(err) => {
warn!("Error while getting {}, retrying: {}", url, err);
res = Err(CheckerError::ReqwestError{error: err.to_string()});
2020-01-13 08:39:38 +00:00
continue;
}
Ok(ok) => {
2020-01-13 22:13:00 +00:00
let status = ok.status();
if status != StatusCode::OK {
lazy_static! {
static ref ACTIONS_REGEX: Regex = Regex::new(r"https://github.com/(?P<org>[^/]+)/(?P<repo>[^/]+)/actions(?:\?workflow=.+)?").unwrap();
2021-06-13 15:07:34 +00:00
static ref YOUTUBE_VIDEO_REGEX: Regex = Regex::new(r"https://www.youtube.com/watch\?v=(?P<video_id>.+)").unwrap();
static ref YOUTUBE_PLAYLIST_REGEX: Regex = Regex::new(r"https://www.youtube.com/playlist\?list=(?P<playlist_id>.+)").unwrap();
static ref YOUTUBE_CONSENT_REGEX: Regex = Regex::new(r"https://consent.youtube.com/m\?continue=.+").unwrap();
static ref AZURE_BUILD_REGEX: Regex = Regex::new(r"https://dev.azure.com/[^/]+/[^/]+/_build").unwrap();
2020-01-13 22:13:00 +00:00
}
if status == StatusCode::NOT_FOUND && ACTIONS_REGEX.is_match(&url) {
let rewritten = ACTIONS_REGEX.replace_all(&url, "https://github.com/$org/$repo");
warn!("Got 404 with Github actions, so replacing {} with {}", url, rewritten);
let (_new_url, res) = get_url_core(rewritten.to_string()).await;
2020-01-13 22:13:00 +00:00
return (url, res);
}
2021-06-13 15:07:34 +00:00
if status == StatusCode::FOUND && YOUTUBE_VIDEO_REGEX.is_match(&url) {
// Based off of https://gist.github.com/tonY1883/a3b85925081688de569b779b4657439b
// Guesswork is that the img feed will cause less 302's than the main url
// See https://github.com/rust-unofficial/awesome-rust/issues/814 for original issue
2021-06-13 15:07:34 +00:00
let rewritten = YOUTUBE_VIDEO_REGEX.replace_all(&url, "http://img.youtube.com/vi/$video_id/mqdefault.jpg");
warn!("Got 302 with Youtube, so replacing {} with {}", url, rewritten);
let (_new_url, res) = get_url_core(rewritten.to_string()).await;
return (url, res);
};
2021-06-13 15:07:34 +00:00
if status == StatusCode::FOUND && YOUTUBE_PLAYLIST_REGEX.is_match(&url) {
let location = ok.headers().get("LOCATION").map(|h| h.to_str().unwrap()).unwrap_or_default();
if YOUTUBE_CONSENT_REGEX.is_match(location) {
warn!("Got Youtube consent link for {}, so assuming playlist is ok", url);
return (url, Ok(()));
}
};
if status == StatusCode::FOUND && AZURE_BUILD_REGEX.is_match(&url) {
// Azure build urls always redirect to a particular build id, so no stable url guarantees
let redirect = ok.headers().get(header::LOCATION).unwrap().to_str().unwrap();
let merged_url = Url::parse(&url).unwrap().join(redirect).unwrap();
info!("Got 302 from Azure devops, so replacing {} with {}", url, merged_url);
let (_new_url, res) = get_url_core(merged_url.into_string()).await;
return (url, res);
}
2020-01-13 22:13:00 +00:00
warn!("Error while getting {}, retrying: {}", url, status);
if status.is_redirection() {
res = Err(CheckerError::HttpError {status: status.as_u16(), location: ok.headers().get(header::LOCATION).and_then(|h| h.to_str().ok()).map(|x| x.to_string())});
2020-01-20 17:53:24 +00:00
break;
2020-01-13 22:13:00 +00:00
} else {
res = Err(CheckerError::HttpError {status: status.as_u16(), location: None});
2020-01-20 17:53:24 +00:00
continue;
}
}
lazy_static! {
2020-02-12 23:38:26 +00:00
static ref TRAVIS_IMG_REGEX: Regex = Regex::new(r"https://api.travis-ci.(?:com|org)/[^/]+/.+\.svg(\?.+)?").unwrap();
static ref GITHUB_ACTIONS_REGEX: Regex = Regex::new(r"https://github.com/[^/]+/[^/]+/workflows/[^/]+/badge.svg(\?.+)?").unwrap();
2020-01-20 17:53:24 +00:00
}
2020-02-12 23:38:26 +00:00
if let Some(matches) = TRAVIS_IMG_REGEX.captures(&url) {
// Previously we checked the Content-Disposition headers, but sometimes that is incorrect
// We're now looking for the explicit text "unknown" in the middle of the SVG
let content = ok.text().await.unwrap();
if content.contains("unknown") {
2020-01-20 17:53:24 +00:00
res = Err(CheckerError::TravisBuildUnknown);
break;
2020-01-13 22:13:00 +00:00
}
2020-02-12 23:38:26 +00:00
let query = matches.get(1).map(|x| x.as_str()).unwrap_or("");
if !query.starts_with("?") || query.find("branch=").is_none() {
res = Err(CheckerError::TravisBuildNoBranch);
break;
}
2020-01-13 22:13:00 +00:00
}
if let Some(matches) = GITHUB_ACTIONS_REGEX.captures(&url) {
debug!("Github actions match {:?}", matches);
let query = matches.get(1).map(|x| x.as_str()).unwrap_or("");
if !query.starts_with("?") || query.find("branch=").is_none() {
res = Err(CheckerError::GithubActionNoBranch);
break;
}
2020-01-13 22:13:00 +00:00
}
debug!("Finished {}", url);
res = Ok(());
break;
2020-01-13 22:13:00 +00:00
}
2020-01-13 08:39:38 +00:00
}
}
2020-01-13 22:13:00 +00:00
(url, res)
}.boxed()
2020-01-10 23:07:36 +00:00
}
/// Outcome of the most recent check of a link.
#[derive(Debug, Serialize, Deserialize)]
enum Working {
    /// The last check succeeded.
    Yes,
    /// The last check failed with the contained error.
    No(CheckerError),
}
2020-01-10 23:07:36 +00:00
/// Persisted check state for a single url.
#[derive(Debug, Serialize, Deserialize)]
struct Link {
    /// When the url was last seen working; `None` if it has never passed a check.
    last_working: Option<DateTime<Local>>,
    /// When the url was last checked, whatever the outcome.
    updated_at: DateTime<Local>,
    /// Result of that last check.
    working: Working,
}
/// Check state of every known link, keyed by url. This is the structure that
/// is read from and written to `results/results.yaml`.
type Results = BTreeMap<String, Link>;
2020-01-10 23:07:36 +00:00
#[tokio::main]
async fn main() -> Result<(), Error> {
env_logger::init();
2020-01-10 23:07:36 +00:00
let markdown_input = fs::read_to_string("README.md").expect("Can't read README.md");
let parser = Parser::new(&markdown_input);
let mut used: BTreeSet<String> = BTreeSet::new();
let mut results: Results = fs::read_to_string("results/results.yaml")
.map_err(|e| format_err!("{}", e))
.and_then(|x| serde_yaml::from_str(&x).map_err(|e| format_err!("{}", e)))
.unwrap_or(Results::new());
2020-01-10 23:07:36 +00:00
let mut url_checks = vec![];
let min_between_checks: Duration = Duration::days(3);
let max_allowed_failed: Duration = Duration::days(7);
let mut do_check = |url: String| {
if !url.starts_with("http") {
return;
}
used.insert(url.clone());
if let Some(link) = results.get(&url) {
if let Working::Yes = link.working {
let since = Local::now() - link.updated_at;
if since < min_between_checks {
return;
}
}
}
let check = get_url(url).boxed();
url_checks.push(check);
};
2020-01-10 23:07:36 +00:00
for (event, _range) in parser.into_offset_iter() {
match event {
Event::Start(tag) => {
match tag {
Tag::Link(_link_type, url, _title) | Tag::Image(_link_type, url, _title) => {
do_check(url.to_string());
2020-01-13 17:04:46 +00:00
}
_ => {}
}
}
Event::Html(content) => {
return Err(format_err!("Contains HTML content, not markdown: {}", content));
2020-01-10 23:07:36 +00:00
}
_ => {}
2020-01-10 23:07:36 +00:00
}
}
let results_keys = results.keys().cloned().collect::<BTreeSet<String>>();
let old_links = results_keys.difference(&used);
for link in old_links {
results.remove(link).unwrap();
}
fs::write("results/results.yaml", serde_yaml::to_string(&results)?)?;
2020-03-08 16:49:04 +00:00
let mut not_written = 0;
let mut last_written = Local::now();
2020-01-10 23:07:36 +00:00
while url_checks.len() > 0 {
2020-06-02 21:24:49 +00:00
debug!("Waiting for {}", url_checks.len());
2020-01-10 23:07:36 +00:00
let ((url, res), _index, remaining) = select_all(url_checks).await;
url_checks = remaining;
match res {
Ok(_) => {
print!("\u{2714} ");
if let Some(link) = results.get_mut(&url) {
link.updated_at = Local::now();
link.last_working = Some(Local::now());
link.working = Working::Yes;
} else {
results.insert(url.clone(), Link {
updated_at: Local::now(),
last_working: Some(Local::now()),
working: Working::Yes
});
}
2020-01-10 23:07:36 +00:00
},
Err(err) => {
print!("\u{2718} ");
if let Some(link) = results.get_mut(&url) {
link.updated_at = Local::now();
link.working = Working::No(err);
} else {
results.insert(url.clone(), Link {
updated_at: Local::now(),
working: Working::No(err),
last_working: None
});
}
2020-01-10 23:07:36 +00:00
}
}
std::io::stdout().flush().unwrap();
2020-03-08 16:49:04 +00:00
not_written += 1;
let duration = Local::now() - last_written;
if duration > Duration::seconds(5) || not_written > 20 {
fs::write("results/results.yaml", serde_yaml::to_string(&results)?)?;
not_written = 0;
last_written = Local::now();
}
2020-01-10 23:07:36 +00:00
}
2020-03-08 16:49:04 +00:00
fs::write("results/results.yaml", serde_yaml::to_string(&results)?)?;
2020-01-10 23:07:36 +00:00
println!("");
let mut failed: u32 = 0;
for (url, link) in results.iter() {
if let Working::No(ref err) = link.working {
match err {
2020-04-30 23:20:27 +00:00
CheckerError::HttpError {status, ..} if *status == 301 || *status == 302 || *status == 404 => {
println!("{} {:?}", url, link);
failed +=1;
continue;
}
_ => {}
};
if let Some(last_working) = link.last_working {
let since = Local::now() - last_working;
if since > max_allowed_failed {
2020-04-21 10:04:59 +00:00
println!("{} {:?}", url, link);
failed +=1;
} else {
println!("Failure occurred but only {}, so we're not worrying yet: {}", chrono_humanize::HumanTime::from(-since), formatter(err, url));
}
} else {
println!("{} {:?}", url, link);
failed +=1;
continue;
}
}
}
if failed == 0 {
println!("No errors!");
Ok(())
} else {
Err(format_err!("{} urls with errors", failed))
2020-01-10 23:07:36 +00:00
}
}