lowfi/src/scrape.rs

70 lines
2.1 KiB
Rust
Raw Normal View History

2024-09-23 21:59:07 +02:00
use std::sync::LazyLock;
use futures::{stream::FuturesUnordered, StreamExt};
use scraper::{Html, Selector};
/// CSS selector matching every anchor placed directly inside the `<pre>` element
/// of a page body. Compiled lazily on first use and shared afterwards.
static SELECTOR: LazyLock<Selector> =
    LazyLock::new(|| Selector::parse("html > body > pre > a").unwrap());
/// Downloads the page at `path` (relative to the uploads root) and returns the
/// `href` of every anchor matched by [`SELECTOR`], minus the first five.
///
/// # Errors
///
/// Returns an error if the request fails or the response body cannot be read as text.
async fn parse(path: &str) -> eyre::Result<Vec<String>> {
    let url = format!("https://lofigirl.com/wp-content/uploads/{path}");
    let response = reqwest::get(url).await?;
    let document = response.text().await?;
    let html = Html::parse_document(&document);

    Ok(html
        .select(&SELECTOR)
        // NOTE(review): the first five anchors are presumably the listing's sort
        // links and parent-directory link rather than real entries; confirm
        // against the live page.
        .skip(5)
        // An anchor without an `href` is not a usable entry; skip it instead of panicking.
        .filter_map(|anchor| anchor.attr("href"))
        .map(String::from)
        .collect())
}
/// Scans the entire file server and returns a list of paths to mp3 files.
///
/// Rather than recursing (async Rust makes recursive functions awkward), this walks
/// exactly three fixed levels: the root listing, then each year's listing, then the
/// listing of every entry found under a year. Year listings are fetched one after
/// another; the listings below them are fetched concurrently.
///
/// The returned paths are relative to the uploads root, built as
/// `{year}/{month}{file}`.
///
/// # Errors
///
/// Returns an error if any of the listing requests fails.
async fn scan() -> eyre::Result<Vec<String>> {
    let items = parse("").await?;

    // Only root entries shaped like `<number>/` are treated as year directories.
    let years: Vec<u32> = items
        .iter()
        .filter_map(|x| x.strip_suffix('/')?.parse().ok())
        .collect();

    // A little bit of async to run all of the per-month listings concurrently.
    let mut futures = FuturesUnordered::new();
    for year in years {
        let months = parse(&year.to_string()).await?;
        for month in months {
            futures.push(async move {
                // NOTE(review): `month` is expected to already end in `/`, since file
                // names are appended to `path` without a separator; confirm.
                let path = format!("{year}/{month}");
                // Propagate request failures to the caller instead of panicking.
                let items = parse(&path).await?;
                let items: Vec<String> = items
                    .into_iter()
                    .filter(|x| x.ends_with(".mp3"))
                    .map(|x| format!("{path}{x}"))
                    .collect();
                Ok::<_, eyre::Report>(items)
            });
        }
    }

    let mut files = Vec::new();
    while let Some(result) = futures.next().await {
        files.extend(result?);
    }

    Ok(files)
}
/// Runs a full [`scan`] of the file server and prints each discovered path to
/// standard output, one per line.
///
/// # Errors
///
/// Returns whatever error [`scan`] reports.
pub async fn scrape() -> eyre::Result<()> {
    scan()
        .await?
        .iter()
        .for_each(|track| println!("{track}"));

    Ok(())
}