mirror of
https://github.com/talwat/lowfi
synced 2025-08-17 15:12:37 +00:00
feat: finish chillhop scraper
This commit is contained in:
parent
3f55768754
commit
91bb61dd92
7
.vscode/launch.json
vendored
Normal file
7
.vscode/launch.json
vendored
Normal file
@ -0,0 +1,7 @@
|
||||
{
|
||||
// Use IntelliSense to learn about possible attributes.
|
||||
// Hover to view descriptions of existing attributes.
|
||||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||
"version": "0.2.0",
|
||||
"configurations": []
|
||||
}
|
30
Cargo.lock
generated
30
Cargo.lock
generated
@ -1016,6 +1016,15 @@ version = "0.4.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
|
||||
|
||||
[[package]]
|
||||
name = "html-escape"
|
||||
version = "0.2.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6d1ad449764d627e22bfd7cd5e8868264fc9236e07c752972b4080cd351cb476"
|
||||
dependencies = [
|
||||
"utf8-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "html5ever"
|
||||
version = "0.29.0"
|
||||
@ -1410,6 +1419,7 @@ dependencies = [
|
||||
"dirs",
|
||||
"eyre",
|
||||
"futures",
|
||||
"html-escape",
|
||||
"lazy_static",
|
||||
"libc",
|
||||
"mpris-server",
|
||||
@ -1417,6 +1427,8 @@ dependencies = [
|
||||
"reqwest",
|
||||
"rodio",
|
||||
"scraper",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"thiserror 2.0.12",
|
||||
"tokio",
|
||||
"unicode-segmentation",
|
||||
@ -2293,18 +2305,18 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.217"
|
||||
version = "1.0.219"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70"
|
||||
checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6"
|
||||
dependencies = [
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_derive"
|
||||
version = "1.0.217"
|
||||
version = "1.0.219"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0"
|
||||
checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@ -2313,9 +2325,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "serde_json"
|
||||
version = "1.0.135"
|
||||
version = "1.0.142"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2b0d7ba2887406110130a978386c4e1befb98c674b4fba677954e4db976630d9"
|
||||
checksum = "030fedb782600dcbd6f02d479bf0d817ac3bb40d644745b769d6a96bc3afc5a7"
|
||||
dependencies = [
|
||||
"itoa",
|
||||
"memchr",
|
||||
@ -2985,6 +2997,12 @@ version = "1.0.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
|
||||
|
||||
[[package]]
|
||||
name = "utf8-width"
|
||||
version = "0.1.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "86bd8d4e895da8537e5315b8254664e6b769c4ff3db18321b297a1e7004392e3"
|
||||
|
||||
[[package]]
|
||||
name = "utf8_iter"
|
||||
version = "1.0.4"
|
||||
|
@ -54,3 +54,6 @@ lazy_static = "1.5.0"
|
||||
libc = "0.2.167"
|
||||
url = "2.5.4"
|
||||
unicode-segmentation = "1.12.0"
|
||||
serde = { version = "1.0.219", features = ["derive"] }
|
||||
serde_json = "1.0.142"
|
||||
html-escape = "0.2.13"
|
||||
|
4
scripts/fix_cache.sh
Normal file
4
scripts/fix_cache.sh
Normal file
@ -0,0 +1,4 @@
|
||||
#!/bin/sh

# Delete every file under the current directory whose contents include the
# text "429 Too Many Requests". File names are passed NUL-separated so that
# unusual characters in names cannot split an argument.
grep -rlZ '429 Too Many Requests' . | xargs -0 rm -f

# Then delete any regular files that are empty.
find . -type f -empty -delete
|
@ -104,7 +104,7 @@ async fn main() -> eyre::Result<()> {
|
||||
include_full,
|
||||
} => match source {
|
||||
Sources::Lofigirl => scrapers::lofigirl::scrape(extension, include_full).await?,
|
||||
Sources::Chillhop => scrapers::chillhop::scrape().await,
|
||||
Sources::Chillhop => scrapers::chillhop::scrape().await?,
|
||||
},
|
||||
}
|
||||
} else {
|
||||
|
@ -1,53 +1,177 @@
|
||||
use eyre::{bail, eyre};
|
||||
use futures::{stream::FuturesOrdered, StreamExt};
|
||||
use lazy_static::lazy_static;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use reqwest::Client;
|
||||
use scraper::{Html, Selector};
|
||||
use serde::Deserialize;
|
||||
use tokio::{
|
||||
fs::{self, File},
|
||||
io::AsyncWriteExt,
|
||||
};
|
||||
|
||||
struct Release {
|
||||
pub tracks: Vec<String>,
|
||||
pub author: String,
|
||||
lazy_static! {
    /// Matches the release links (`<a>` directly under `.table-body`) on a listing page.
    static ref RELEASES: Selector = Selector::parse(".table-body > a").unwrap();

    /// Matches the `<label>` inside a release link; its text is compared against
    /// "Compilation" and "Mix" when filtering releases.
    static ref RELEASE_LABEL: Selector = Selector::parse("label").unwrap();

    // Currently unused selectors for the release date and name:
    // static ref RELEASE_DATE: Selector = Selector::parse(".release-feat-props > .text-xs").unwrap();
    // static ref RELEASE_NAME: Selector = Selector::parse(".release-feat-props > h2").unwrap();

    /// Matches the artist link on a release page.
    static ref RELEASE_AUTHOR: Selector =
        Selector::parse(".release-feat-props .artist-link").unwrap();

    /// Matches `<textarea>` elements on a release page; the first one is parsed as
    /// the release's JSON data.
    static ref RELEASE_TEXTAREA: Selector = Selector::parse("textarea").unwrap();
}
|
||||
|
||||
struct Data {
|
||||
pub releases: Vec<Release>,
|
||||
#[derive(Deserialize, Debug)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct Track {
|
||||
title: String,
|
||||
file_id: String,
|
||||
artists: String,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Debug)]
|
||||
struct Release {
|
||||
#[serde(skip)]
|
||||
pub path: String,
|
||||
#[serde(skip)]
|
||||
pub name: String,
|
||||
pub tracks: Vec<Track>,
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
enum ReleaseError {
|
||||
#[error("invalid track: {0}")]
|
||||
Invalid(#[from] eyre::Error),
|
||||
|
||||
#[error("track explicitly ignored")]
|
||||
Ignored,
|
||||
}
|
||||
|
||||
impl Release {
|
||||
pub async fn scan(path: String, client: Client) -> Result<Self, ReleaseError> {
|
||||
let content = get(&client, &path).await?;
|
||||
let html = Html::parse_document(&content);
|
||||
|
||||
let textarea = html
|
||||
.select(&RELEASE_TEXTAREA)
|
||||
.next()
|
||||
.ok_or(eyre!("unable to find textarea: {path}"))?;
|
||||
let mut release: Self = serde_json::from_str(&textarea.inner_html()).unwrap();
|
||||
release.tracks.reverse();
|
||||
|
||||
let author = html
|
||||
.select(&RELEASE_AUTHOR)
|
||||
.next()
|
||||
.ok_or(eyre!("unable to find author: {path}"))?;
|
||||
if author.inner_html() == "Kenji" {
|
||||
return Err(ReleaseError::Ignored);
|
||||
}
|
||||
|
||||
Ok(release)
|
||||
}
|
||||
}
|
||||
|
||||
/// Sends a get request, with caching.
|
||||
async fn get(client: &Client, path: &str) -> String {
|
||||
let cache = PathBuf::from(format!("./cache/chillhop/{path}.html"));
|
||||
async fn get(client: &Client, path: &str) -> eyre::Result<String> {
|
||||
let trimmed = path.trim_matches('/');
|
||||
let cache = PathBuf::from(format!("./cache/chillhop/{trimmed}.html"));
|
||||
|
||||
if let Ok(x) = fs::read_to_string(&cache).await {
|
||||
x
|
||||
Ok(x)
|
||||
} else {
|
||||
let resp = client
|
||||
.get(format!("https://chillhop.com/{path}"))
|
||||
.get(format!("https://chillhop.com/{trimmed}"))
|
||||
.send()
|
||||
.await
|
||||
.unwrap();
|
||||
let text = resp.text().await.unwrap();
|
||||
.await?;
|
||||
|
||||
let status = resp.status();
|
||||
|
||||
if status == 429 {
|
||||
bail!("rate limit reached: {path}");
|
||||
}
|
||||
|
||||
if status != 404 && !status.is_success() && !status.is_redirection() {
|
||||
bail!("non success code {}: {path}", resp.status().as_u16());
|
||||
}
|
||||
|
||||
let text = resp.text().await?;
|
||||
|
||||
let parent = cache.parent();
|
||||
if let Some(x) = parent {
|
||||
if x != Path::new("") {
|
||||
fs::create_dir_all(x).await.unwrap();
|
||||
fs::create_dir_all(x).await?;
|
||||
}
|
||||
}
|
||||
|
||||
let mut file = File::create(&cache).await.unwrap();
|
||||
file.write_all(text.as_bytes()).await.unwrap();
|
||||
let mut file = File::create(&cache).await?;
|
||||
file.write_all(text.as_bytes()).await?;
|
||||
|
||||
text
|
||||
if status.is_redirection() {
|
||||
bail!("redirect: {path}")
|
||||
}
|
||||
|
||||
if status == 404 {
|
||||
bail!("not found: {path}")
|
||||
}
|
||||
|
||||
Ok(text)
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn scrape() {
|
||||
async fn scan_page(
|
||||
number: usize,
|
||||
client: &Client,
|
||||
) -> eyre::Result<Vec<impl futures::Future<Output = Result<Release, ReleaseError>>>> {
|
||||
let path = format!("releases/?page={number}");
|
||||
let content = get(client, &path).await?;
|
||||
let html = Html::parse_document(&content);
|
||||
|
||||
let elements = html.select(&RELEASES);
|
||||
Ok(elements
|
||||
.filter_map(|x| {
|
||||
let label = x.select(&RELEASE_LABEL).next()?.inner_html();
|
||||
if label == "Compilation" || label == "Mix" {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(Release::scan(x.attr("href")?.to_string(), client.clone()))
|
||||
})
|
||||
.collect())
|
||||
}
|
||||
|
||||
pub async fn scrape() -> eyre::Result<()> {
|
||||
const PAGE_COUNT: usize = 40;
|
||||
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36";
|
||||
|
||||
fs::create_dir_all("./cache/chillhop").await.unwrap();
|
||||
let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
|
||||
|
||||
get(&client, "releases/?page=30").await;
|
||||
let mut futures = FuturesOrdered::new();
|
||||
|
||||
// This is slightly less memory efficient than I'd hope, but it is what it is.
|
||||
for page in 0..=PAGE_COUNT {
|
||||
for x in scan_page(page, &client).await? {
|
||||
futures.push_front(x);
|
||||
}
|
||||
}
|
||||
|
||||
while let Some(result) = futures.next().await {
|
||||
let release = match result {
|
||||
Ok(release) => release,
|
||||
Err(error) => {
|
||||
eprintln!("error: {}, skipping", error);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
for track in release.tracks {
|
||||
let title = html_escape::decode_html_entities(&track.title);
|
||||
let artist = html_escape::decode_html_entities(
|
||||
track.artists.split(", ").next().unwrap_or(&track.artists),
|
||||
);
|
||||
|
||||
println!("{}!{artist} - {title}", track.file_id)
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user