2024-09-27 14:21:49 +02:00
|
|
|
//! Has all of the functions for the `scrape` command.
|
|
|
|
|
2024-09-23 21:59:07 +02:00
|
|
|
use std::sync::LazyLock;
|
|
|
|
|
|
|
|
use futures::{stream::FuturesUnordered, StreamExt};
|
|
|
|
use scraper::{Html, Selector};
|
|
|
|
|
2024-09-27 14:21:49 +02:00
|
|
|
/// Root URL that every scraped path is appended to; it ends with a trailing `/`.
const BASE_URL: &str = "https://lofigirl.com/wp-content/uploads/";
|
2024-09-25 23:54:55 +02:00
|
|
|
|
2024-09-25 18:11:42 +02:00
|
|
|
/// CSS selector matching every `<a>` that is a direct child of a `<pre>`
/// directly under `<body>`, compiled once on first use.
static SELECTOR: LazyLock<Selector> = LazyLock::new(|| {
    // The selector text is a constant, so a parse failure can only be a programming error.
    Selector::parse("html > body > pre > a").expect("hard-coded CSS selector must be valid")
});
|
2024-09-23 21:59:07 +02:00
|
|
|
|
|
|
|
async fn parse(path: &str) -> eyre::Result<Vec<String>> {
|
2024-09-25 23:54:55 +02:00
|
|
|
let response = reqwest::get(format!("{}{}", BASE_URL, path)).await?;
|
2024-09-23 21:59:07 +02:00
|
|
|
let document = response.text().await?;
|
|
|
|
|
|
|
|
let html = Html::parse_document(&document);
|
2024-09-25 18:11:42 +02:00
|
|
|
Ok(html
|
|
|
|
.select(&SELECTOR)
|
|
|
|
.skip(5)
|
|
|
|
.map(|x| String::from(x.attr("href").unwrap()))
|
|
|
|
.collect())
|
2024-09-23 21:59:07 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/// This function basically just scans the entire file server, and returns a list of paths to mp3 files.
|
2024-09-25 18:11:42 +02:00
|
|
|
///
|
2024-09-23 21:59:07 +02:00
|
|
|
/// It's a bit hacky, and basically works by checking all of the years, then months, and then all of the files.
|
|
|
|
/// This is done as a way to avoid recursion, since async rust really hates recursive functions.
|
2024-09-25 23:54:55 +02:00
|
|
|
async fn scan(extention: &str, include_full: bool) -> eyre::Result<Vec<String>> {
|
|
|
|
let extention = &format!(".{}", extention);
|
|
|
|
|
2024-09-23 21:59:07 +02:00
|
|
|
let items = parse("").await?;
|
2024-09-25 18:11:42 +02:00
|
|
|
|
|
|
|
let years: Vec<u32> = items
|
|
|
|
.iter()
|
|
|
|
.filter_map(|x| {
|
|
|
|
let year = x.strip_suffix("/")?;
|
|
|
|
year.parse().ok()
|
|
|
|
})
|
|
|
|
.collect();
|
2024-09-23 21:59:07 +02:00
|
|
|
|
|
|
|
// A little bit of async to run all of the months concurrently.
|
|
|
|
let mut futures = FuturesUnordered::new();
|
2024-09-25 18:11:42 +02:00
|
|
|
|
2024-09-23 21:59:07 +02:00
|
|
|
for year in years {
|
|
|
|
let months = parse(&year.to_string()).await?;
|
2024-09-25 18:11:42 +02:00
|
|
|
|
2024-09-23 21:59:07 +02:00
|
|
|
for month in months {
|
|
|
|
futures.push(async move {
|
|
|
|
let path = format!("{}/{}", year, month);
|
|
|
|
|
|
|
|
let items = parse(&path).await.unwrap();
|
2024-09-27 14:21:49 +02:00
|
|
|
items
|
2024-09-25 18:11:42 +02:00
|
|
|
.into_iter()
|
|
|
|
.filter_map(|x| {
|
2024-09-25 23:54:55 +02:00
|
|
|
if x.ends_with(extention) {
|
|
|
|
if include_full {
|
|
|
|
Some(format!("{BASE_URL}{path}{x}"))
|
|
|
|
} else {
|
|
|
|
Some(format!("{path}{x}"))
|
|
|
|
}
|
2024-09-25 18:11:42 +02:00
|
|
|
} else {
|
|
|
|
None
|
|
|
|
}
|
|
|
|
})
|
2024-09-27 14:21:49 +02:00
|
|
|
.collect::<Vec<String>>()
|
2024-09-23 21:59:07 +02:00
|
|
|
});
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
let mut files = Vec::new();
|
|
|
|
while let Some(mut result) = futures.next().await {
|
|
|
|
files.append(&mut result);
|
|
|
|
}
|
|
|
|
|
|
|
|
eyre::Result::Ok(files)
|
|
|
|
}
|
|
|
|
|
2024-09-25 23:54:55 +02:00
|
|
|
pub async fn scrape(extention: String, include_full: bool) -> eyre::Result<()> {
|
|
|
|
let files = scan(&extention, include_full).await?;
|
2024-09-23 21:59:07 +02:00
|
|
|
for file in files {
|
|
|
|
println!("{}", file);
|
|
|
|
}
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
}
|