feat: add archive scraper

feat: add more information to errors
This commit is contained in:
talwat 2025-08-09 23:38:00 +02:00
parent bdd508bfbb
commit 6f15f9226f
11 changed files with 2641 additions and 74 deletions

2459
data/lofigirl-new.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@ -71,14 +71,6 @@ enum Commands {
Scrape {
// The source to scrape from.
source: scrapers::Source,
/// The file extension to search for, defaults to mp3.
#[clap(long, short, default_value = "mp3")]
extension: String,
/// Whether to include the full HTTP URL or just the distinguishing part.
#[clap(long, short)]
include_full: bool,
},
}
@ -101,12 +93,9 @@ async fn main() -> eyre::Result<()> {
match command {
// TODO: Actually distinguish between sources.
#[cfg(feature = "scrape")]
Commands::Scrape {
source,
extension,
include_full,
} => match source {
Source::Lofigirl => scrapers::lofigirl::scrape(extension, include_full).await?,
Commands::Scrape { source } => match source {
Source::Archive => scrapers::archive::scrape().await?,
Source::Lofigirl => scrapers::lofigirl::scrape().await?,
Source::Chillhop => scrapers::chillhop::scrape().await?,
},
}

View File

@ -41,7 +41,8 @@ pub use error::Error;
pub mod mpris;
/// The time to wait in between errors.
const TIMEOUT: Duration = Duration::from_secs(3);
/// TODO: Make this configurable.
const TIMEOUT: Duration = Duration::from_secs(5);
/// Main struct responsible for queuing up & playing tracks.
// TODO: Consider refactoring [Player] from being stored in an [Arc], into containing many smaller [Arc]s.

View File

@ -49,7 +49,7 @@ impl Downloader {
Ok(track) => self.player.tracks.write().await.push_back(track),
Err(error) if !error.is_timeout() => {
if debug {
panic!("{}", error)
panic!("{error}")
}
sleep(TIMEOUT).await;

View File

@ -10,7 +10,7 @@ use crate::{
impl Player {
/// Fetches the next track from the queue, or a random track if the queue is empty.
/// This will also set the current track to the fetched track's info.
async fn fetch(&self) -> Result<tracks::DecodedTrack, tracks::TrackError> {
async fn fetch(&self) -> Result<tracks::DecodedTrack, tracks::Error> {
// TODO: Consider replacing this with `unwrap_or_else` when async closures are stabilized.
let track = self.tracks.write().await.pop_front();
let track = if let Some(track) = track {
@ -66,7 +66,7 @@ impl Player {
Err(error) => {
if !error.is_timeout() {
if debug {
panic!("{error:?}")
panic!("{error}")
}
sleep(TIMEOUT).await;

View File

@ -8,12 +8,14 @@ use tokio::{
io::AsyncWriteExt,
};
pub mod archive;
pub mod chillhop;
pub mod lofigirl;
#[derive(Clone, Copy, PartialEq, Eq, Debug, ValueEnum)]
pub enum Source {
Lofigirl,
Archive,
Chillhop,
}
@ -21,6 +23,7 @@ impl Source {
pub fn cache_dir(&self) -> &'static str {
match self {
Source::Lofigirl => "lofigirl",
Source::Archive => "archive",
Source::Chillhop => "chillhop",
}
}
@ -28,6 +31,7 @@ impl Source {
pub fn url(&self) -> &'static str {
match self {
Source::Chillhop => "https://chillhop.com",
Source::Archive => "https://ia601004.us.archive.org/31/items/lofigirl",
Source::Lofigirl => "https://lofigirl.com/wp-content/uploads",
}
}

74
src/scrapers/archive.rs Normal file
View File

@ -0,0 +1,74 @@
//! Has all of the functions for the `scrape` command.
//!
//! This command is completely optional, and as such isn't subject to the same
//! quality standards as the rest of the codebase.
use futures::{stream::FuturesOrdered, StreamExt};
use lazy_static::lazy_static;
use reqwest::Client;
use scraper::{Html, Selector};
use crate::scrapers::{get, Source};
lazy_static! {
// Parsed once on first use. Selects every `<a>` that is a direct child of a `<pre>`
// directly under `<body>`. The `unwrap` can only fail if the selector literal itself
// is malformed, which would be a programming error.
static ref SELECTOR: Selector = Selector::parse("html > body > pre > a").unwrap();
}
/// Fetches the listing page at `path` for the archive source and returns the
/// `href` of every anchor matched by [`SELECTOR`].
///
/// The first matched anchor is skipped (presumably the link back to the parent
/// directory; verify against the actual listing markup). Anchors without an
/// `href` attribute are ignored rather than causing a panic.
///
/// # Errors
///
/// Returns an error if the page can't be retrieved through [`get`].
async fn parse(client: &Client, path: &str) -> eyre::Result<Vec<String>> {
    // This is the archive scraper, so it has to read from the archive source rather
    // than the Lofigirl one.
    let document = get(client, path, Source::Archive).await?;
    let html = Html::parse_document(&document);

    Ok(html
        .select(&SELECTOR)
        .skip(1)
        .filter_map(|anchor| anchor.attr("href").map(String::from))
        .collect())
}

/// Scans the archive file server and returns a list of paths to mp3 files.
///
/// The root listing is fetched first, and then every release directory in it is
/// fetched concurrently. Only a single level of directories is visited, so no
/// recursion is needed. The returned paths are `{release}{file}`, in the same
/// order as the releases appear in the root listing.
///
/// # Errors
///
/// Returns an error if the root listing or any of the release listings can't be fetched.
async fn scan() -> eyre::Result<Vec<String>> {
    let client = Client::new();

    let mut releases = parse(&client, "/").await?;
    // The last four entries of the root listing are dropped (presumably they aren't
    // release directories; confirm against the live listing). `saturating_sub`
    // keeps an unexpectedly short listing from underflowing.
    releases.truncate(releases.len().saturating_sub(4));

    // A little bit of async to run all of the releases concurrently,
    // while `FuturesOrdered` keeps the results in listing order.
    let mut futures = FuturesOrdered::new();

    for release in releases {
        let client = client.clone();
        futures.push_back(async move {
            let items = parse(&client, &release).await?;
            let tracks = items
                .into_iter()
                .filter(|item| item.ends_with(".mp3"))
                .map(|item| format!("{release}{item}"))
                .collect::<Vec<String>>();

            Ok::<_, eyre::Report>(tracks)
        });
    }

    let mut files = Vec::new();
    while let Some(result) = futures.next().await {
        // Propagate a failed release listing instead of panicking inside the future.
        files.extend(result?);
    }

    Ok(files)
}

/// Runs the archive scraper and prints the result to stdout.
///
/// The first line printed is the archive base URL followed by a `/`,
/// and every following line is the path of one mp3 file.
///
/// # Errors
///
/// Returns an error if scanning the file server fails. The base URL line
/// has already been printed by then.
pub async fn scrape() -> eyre::Result<()> {
    println!("{}/", Source::Archive.url());

    for file in scan().await? {
        println!("{file}");
    }

    Ok(())
}

View File

@ -8,7 +8,7 @@ use lazy_static::lazy_static;
use reqwest::Client;
use scraper::{Html, Selector};
use crate::scrapers::{get, Source};
use crate::scrapers::get;
lazy_static! {
static ref SELECTOR: Selector = Selector::parse("html > body > pre > a").unwrap();
@ -29,10 +29,8 @@ async fn parse(client: &Client, path: &str) -> eyre::Result<Vec<String>> {
///
/// It's a bit hacky, and basically works by checking all of the years, then months, and then all of the files.
/// This is done as a way to avoid recursion, since async rust really hates recursive functions.
async fn scan(extension: &str, include_full: bool) -> eyre::Result<Vec<String>> {
async fn scan() -> eyre::Result<Vec<String>> {
let client = Client::new();
let extension = &format!(".{}", extension);
let items = parse(&client, "/").await?;
let mut years: Vec<u32> = items
@ -60,12 +58,8 @@ async fn scan(extension: &str, include_full: bool) -> eyre::Result<Vec<String>>
items
.into_iter()
.filter_map(|x| {
if x.ends_with(extension) {
if include_full {
Some(format!("{}/{path}{x}", Source::Lofigirl.url()))
} else {
Some(format!("{path}{x}"))
}
if x.ends_with(".mp3") {
Some(format!("{path}{x}"))
} else {
None
}
@ -83,8 +77,8 @@ async fn scan(extension: &str, include_full: bool) -> eyre::Result<Vec<String>>
eyre::Result::Ok(files)
}
pub async fn scrape(extension: String, include_full: bool) -> eyre::Result<()> {
let files = scan(&extension, include_full).await?;
pub async fn scrape() -> eyre::Result<()> {
let files = scan().await?;
for file in files {
println!("{file}");
}

View File

@ -20,41 +20,15 @@ use std::{io::Cursor, path::Path, time::Duration};
use bytes::Bytes;
use inflector::Inflector as _;
use rodio::{Decoder, Source as _};
use thiserror::Error;
use tokio::io;
use unicode_segmentation::UnicodeSegmentation;
use url::form_urlencoded;
pub mod error;
pub mod list;
/// The error type for the track system, which is used to handle errors that occur
/// while downloading, decoding, or playing tracks.
#[derive(Debug, Error)]
pub enum TrackError {
#[error("timeout")]
Timeout,
pub use error::Error;
#[error("unable to decode")]
Decode(#[from] rodio::decoder::DecoderError),
#[error("invalid name")]
InvalidName,
#[error("invalid file path")]
InvalidPath,
#[error("unable to read file")]
File(#[from] io::Error),
#[error("unable to fetch data")]
Request(#[from] reqwest::Error),
}
impl TrackError {
pub const fn is_timeout(&self) -> bool {
matches!(self, Self::Timeout)
}
}
use crate::tracks::error::Context;
/// Just a shorthand for a decoded [Bytes].
pub type DecodedData = Decoder<Cursor<Bytes>>;
@ -92,7 +66,7 @@ impl QueuedTrack {
/// This will actually decode and format the track,
/// returning a [`DecodedTrack`] which can be played
/// and also has a duration & formatted name.
pub fn decode(self) -> eyre::Result<DecodedTrack, TrackError> {
pub fn decode(self) -> eyre::Result<DecodedTrack, Error> {
DecodedTrack::new(self)
}
}
@ -134,13 +108,13 @@ impl Info {
/// Formats a name with [Inflector].
/// This will also strip the first few numbers that are
/// usually present on most lofi tracks.
fn format_name(name: &str) -> eyre::Result<String, TrackError> {
fn format_name(name: &str) -> eyre::Result<String, Error> {
let path = Path::new(name);
let stem = path
.file_stem()
.and_then(|x| x.to_str())
.ok_or(TrackError::InvalidName)?;
.ok_or((name, error::Kind::InvalidName))?;
let formatted = Self::decode_url(stem)
.to_lowercase()
.to_title_case()
@ -181,7 +155,7 @@ impl Info {
name: TrackName,
full_path: String,
decoded: &DecodedData,
) -> eyre::Result<Self, TrackError> {
) -> eyre::Result<Self, Error> {
let (display_name, custom_name) = match name {
TrackName::Raw(raw) => (Self::format_name(&raw)?, false),
TrackName::Formatted(custom) => (custom, true),
@ -210,11 +184,12 @@ pub struct DecodedTrack {
impl DecodedTrack {
/// Creates a new track.
/// This is equivalent to [`QueuedTrack::decode`].
pub fn new(track: QueuedTrack) -> eyre::Result<Self, TrackError> {
pub fn new(track: QueuedTrack) -> eyre::Result<Self, Error> {
let data = Decoder::builder()
.with_byte_len(track.data.len().try_into().unwrap())
.with_data(Cursor::new(track.data))
.build()?;
.build()
.track(track.full_path.clone())?;
let info = Info::new(track.name, track.full_path, &data)?;

61
src/tracks/error.rs Normal file
View File

@ -0,0 +1,61 @@
/// The specific reason a track operation failed.
///
/// This carries only the cause; [`Error`] pairs it with the track it relates to.
#[derive(Debug, thiserror::Error)]
pub enum Kind {
/// A request timed out.
#[error("timeout")]
Timeout,
/// The audio data couldn't be decoded; wraps the underlying decoder error.
#[error("unable to decode: {0}")]
Decode(#[from] rodio::decoder::DecoderError),
/// A usable name couldn't be derived from the track.
#[error("invalid name")]
InvalidName,
/// A file path for the track couldn't be resolved.
#[error("invalid file path")]
InvalidPath,
/// Reading the track from disk failed; wraps the underlying I/O error.
#[error("unable to read file: {0}")]
File(#[from] std::io::Error),
/// An HTTP request failed; wraps the underlying request error.
#[error("unable to fetch data: {0}")]
Request(#[from] reqwest::Error),
}
/// A track error: a [`Kind`] paired with the track it occurred for.
///
/// Displays as the kind's message, followed by a `track: …` line on a new line.
#[derive(Debug, thiserror::Error)]
#[error("{kind}\ntrack: {track}")]
pub struct Error {
/// Identifies the track the error relates to (callers pass a track name or path).
pub track: String,
/// What actually went wrong; also exposed as this error's source.
#[source]
pub kind: Kind,
}
impl Error {
    /// Reports whether this error was caused by a timeout,
    /// i.e. whether its kind is [`Kind::Timeout`].
    pub const fn is_timeout(&self) -> bool {
        let Self { kind, .. } = self;
        matches!(kind, Kind::Timeout)
    }
}
impl<T, E> From<(T, E)> for Error
where
T: Into<String>,
Kind: From<E>,
{
fn from((track, err): (T, E)) -> Self {
Error {
track: track.into(),
kind: Kind::from(err),
}
}
}
/// Extension trait for attaching a track to a fallible value,
/// turning its failure case into an [`Error`].
pub trait Context<T> {
/// Converts `self` into a `Result` whose error is an [`Error`] tagged with `name`.
fn track(self, name: impl Into<String>) -> Result<T, Error>;
}
/// Implements [`Context`] for any `Result` whose error, paired with a track name,
/// can be converted into an [`Error`].
impl<T, E> Context<T> for Result<T, E>
where
    (String, E): Into<Error>,
{
    fn track(self, name: impl Into<String>) -> Result<T, Error> {
        match self {
            Ok(value) => Ok(value),
            // The name is only converted into a `String` when there's actually an error.
            Err(cause) => Err((name.into(), cause).into()),
        }
    }
}

View File

@ -7,7 +7,10 @@ use rand::Rng as _;
use reqwest::Client;
use tokio::fs;
use crate::{data_dir, tracks::TrackError};
use crate::{
data_dir,
tracks::{self, error::Context},
};
use super::QueuedTrack;
@ -52,7 +55,11 @@ impl List {
}
/// Downloads a raw track, but doesn't decode it.
async fn download(&self, track: &str, client: &Client) -> Result<(Bytes, String), TrackError> {
async fn download(
&self,
track: &str,
client: &Client,
) -> Result<(Bytes, String), tracks::Error> {
// If the track has a protocol, then we should ignore the base for it.
let full_path = if track.contains("://") {
track.to_owned()
@ -62,28 +69,31 @@ impl List {
let data: Bytes = if let Some(x) = full_path.strip_prefix("file://") {
let path = if x.starts_with('~') {
let home_path = dirs::home_dir().ok_or(TrackError::InvalidPath)?;
let home = home_path.to_str().ok_or(TrackError::InvalidPath)?;
let home_path =
dirs::home_dir().ok_or((track, tracks::error::Kind::InvalidPath))?;
let home = home_path
.to_str()
.ok_or((track, tracks::error::Kind::InvalidPath))?;
x.replace('~', home)
} else {
x.to_owned()
};
let result = tokio::fs::read(path).await?;
let result = tokio::fs::read(path.clone()).await.track(track)?;
result.into()
} else {
let response = match client.get(full_path.clone()).send().await {
Ok(x) => Ok(x),
Err(x) => {
if x.is_timeout() {
Err(TrackError::Timeout)
Err((track, tracks::error::Kind::Timeout))
} else {
Err(TrackError::Request(x))
Err((track, tracks::error::Kind::Request(x)))
}
}
}?;
response.bytes().await?
response.bytes().await.track(track)?
};
Ok((data, full_path))
@ -93,7 +103,7 @@ impl List {
///
/// The Result's error reports whether a timeout occurred through its `is_timeout` method.
/// This tells lowfi if it shouldn't wait to try again.
pub async fn random(&self, client: &Client) -> Result<QueuedTrack, TrackError> {
pub async fn random(&self, client: &Client) -> Result<QueuedTrack, tracks::Error> {
let (path, custom_name) = self.random_path();
let (data, full_path) = self.download(&path, client).await?;