mirror of
https://github.com/talwat/lowfi
synced 2025-08-18 15:43:01 +00:00
feat: finish chillhop scraper
This commit is contained in:
parent
3f55768754
commit
91bb61dd92
7
.vscode/launch.json
vendored
Normal file
7
.vscode/launch.json
vendored
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
{
|
||||||
|
// Use IntelliSense to learn about possible attributes.
|
||||||
|
// Hover to view descriptions of existing attributes.
|
||||||
|
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||||
|
"version": "0.2.0",
|
||||||
|
"configurations": []
|
||||||
|
}
|
30
Cargo.lock
generated
30
Cargo.lock
generated
@ -1016,6 +1016,15 @@ version = "0.4.3"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
|
checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "html-escape"
|
||||||
|
version = "0.2.13"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6d1ad449764d627e22bfd7cd5e8868264fc9236e07c752972b4080cd351cb476"
|
||||||
|
dependencies = [
|
||||||
|
"utf8-width",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "html5ever"
|
name = "html5ever"
|
||||||
version = "0.29.0"
|
version = "0.29.0"
|
||||||
@ -1410,6 +1419,7 @@ dependencies = [
|
|||||||
"dirs",
|
"dirs",
|
||||||
"eyre",
|
"eyre",
|
||||||
"futures",
|
"futures",
|
||||||
|
"html-escape",
|
||||||
"lazy_static",
|
"lazy_static",
|
||||||
"libc",
|
"libc",
|
||||||
"mpris-server",
|
"mpris-server",
|
||||||
@ -1417,6 +1427,8 @@ dependencies = [
|
|||||||
"reqwest",
|
"reqwest",
|
||||||
"rodio",
|
"rodio",
|
||||||
"scraper",
|
"scraper",
|
||||||
|
"serde",
|
||||||
|
"serde_json",
|
||||||
"thiserror 2.0.12",
|
"thiserror 2.0.12",
|
||||||
"tokio",
|
"tokio",
|
||||||
"unicode-segmentation",
|
"unicode-segmentation",
|
||||||
@ -2293,18 +2305,18 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "serde"
|
name = "serde"
|
||||||
version = "1.0.217"
|
version = "1.0.219"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70"
|
checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"serde_derive",
|
"serde_derive",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "serde_derive"
|
name = "serde_derive"
|
||||||
version = "1.0.217"
|
version = "1.0.219"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0"
|
checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
@ -2313,9 +2325,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "serde_json"
|
name = "serde_json"
|
||||||
version = "1.0.135"
|
version = "1.0.142"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "2b0d7ba2887406110130a978386c4e1befb98c674b4fba677954e4db976630d9"
|
checksum = "030fedb782600dcbd6f02d479bf0d817ac3bb40d644745b769d6a96bc3afc5a7"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"itoa",
|
"itoa",
|
||||||
"memchr",
|
"memchr",
|
||||||
@ -2985,6 +2997,12 @@ version = "1.0.5"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
|
checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "utf8-width"
|
||||||
|
version = "0.1.7"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "86bd8d4e895da8537e5315b8254664e6b769c4ff3db18321b297a1e7004392e3"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "utf8_iter"
|
name = "utf8_iter"
|
||||||
version = "1.0.4"
|
version = "1.0.4"
|
||||||
|
@ -54,3 +54,6 @@ lazy_static = "1.5.0"
|
|||||||
libc = "0.2.167"
|
libc = "0.2.167"
|
||||||
url = "2.5.4"
|
url = "2.5.4"
|
||||||
unicode-segmentation = "1.12.0"
|
unicode-segmentation = "1.12.0"
|
||||||
|
serde = { version = "1.0.219", features = ["derive"] }
|
||||||
|
serde_json = "1.0.142"
|
||||||
|
html-escape = "0.2.13"
|
||||||
|
4
scripts/fix_cache.sh
Normal file
4
scripts/fix_cache.sh
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
|
||||||
|
grep -rlZ "429 Too Many Requests" . | xargs -0 rm -f
|
||||||
|
find . -type f -empty -delete
|
@ -104,7 +104,7 @@ async fn main() -> eyre::Result<()> {
|
|||||||
include_full,
|
include_full,
|
||||||
} => match source {
|
} => match source {
|
||||||
Sources::Lofigirl => scrapers::lofigirl::scrape(extension, include_full).await?,
|
Sources::Lofigirl => scrapers::lofigirl::scrape(extension, include_full).await?,
|
||||||
Sources::Chillhop => scrapers::chillhop::scrape().await,
|
Sources::Chillhop => scrapers::chillhop::scrape().await?,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -1,53 +1,177 @@
|
|||||||
|
use eyre::{bail, eyre};
|
||||||
|
use futures::{stream::FuturesOrdered, StreamExt};
|
||||||
|
use lazy_static::lazy_static;
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
use reqwest::Client;
|
use reqwest::Client;
|
||||||
|
use scraper::{Html, Selector};
|
||||||
|
use serde::Deserialize;
|
||||||
use tokio::{
|
use tokio::{
|
||||||
fs::{self, File},
|
fs::{self, File},
|
||||||
io::AsyncWriteExt,
|
io::AsyncWriteExt,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct Release {
|
lazy_static! {
|
||||||
pub tracks: Vec<String>,
|
static ref RELEASES: Selector = Selector::parse(".table-body > a").unwrap();
|
||||||
pub author: String,
|
static ref RELEASE_LABEL: Selector = Selector::parse("label").unwrap();
|
||||||
|
// static ref RELEASE_DATE: Selector = Selector::parse(".release-feat-props > .text-xs").unwrap();
|
||||||
|
// static ref RELEASE_NAME: Selector = Selector::parse(".release-feat-props > h2").unwrap();
|
||||||
|
static ref RELEASE_AUTHOR: Selector = Selector::parse(".release-feat-props .artist-link").unwrap();
|
||||||
|
static ref RELEASE_TEXTAREA: Selector = Selector::parse("textarea").unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
struct Data {
|
#[derive(Deserialize, Debug)]
|
||||||
pub releases: Vec<Release>,
|
#[serde(rename_all = "camelCase")]
|
||||||
|
pub struct Track {
|
||||||
|
title: String,
|
||||||
|
file_id: String,
|
||||||
|
artists: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Deserialize, Debug)]
|
||||||
|
struct Release {
|
||||||
|
#[serde(skip)]
|
||||||
|
pub path: String,
|
||||||
|
#[serde(skip)]
|
||||||
|
pub name: String,
|
||||||
|
pub tracks: Vec<Track>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(thiserror::Error, Debug)]
|
||||||
|
enum ReleaseError {
|
||||||
|
#[error("invalid track: {0}")]
|
||||||
|
Invalid(#[from] eyre::Error),
|
||||||
|
|
||||||
|
#[error("track explicitly ignored")]
|
||||||
|
Ignored,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Release {
|
||||||
|
pub async fn scan(path: String, client: Client) -> Result<Self, ReleaseError> {
|
||||||
|
let content = get(&client, &path).await?;
|
||||||
|
let html = Html::parse_document(&content);
|
||||||
|
|
||||||
|
let textarea = html
|
||||||
|
.select(&RELEASE_TEXTAREA)
|
||||||
|
.next()
|
||||||
|
.ok_or(eyre!("unable to find textarea: {path}"))?;
|
||||||
|
let mut release: Self = serde_json::from_str(&textarea.inner_html()).unwrap();
|
||||||
|
release.tracks.reverse();
|
||||||
|
|
||||||
|
let author = html
|
||||||
|
.select(&RELEASE_AUTHOR)
|
||||||
|
.next()
|
||||||
|
.ok_or(eyre!("unable to find author: {path}"))?;
|
||||||
|
if author.inner_html() == "Kenji" {
|
||||||
|
return Err(ReleaseError::Ignored);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(release)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Sends a get request, with caching.
|
/// Sends a get request, with caching.
|
||||||
async fn get(client: &Client, path: &str) -> String {
|
async fn get(client: &Client, path: &str) -> eyre::Result<String> {
|
||||||
let cache = PathBuf::from(format!("./cache/chillhop/{path}.html"));
|
let trimmed = path.trim_matches('/');
|
||||||
|
let cache = PathBuf::from(format!("./cache/chillhop/{trimmed}.html"));
|
||||||
|
|
||||||
if let Ok(x) = fs::read_to_string(&cache).await {
|
if let Ok(x) = fs::read_to_string(&cache).await {
|
||||||
x
|
Ok(x)
|
||||||
} else {
|
} else {
|
||||||
let resp = client
|
let resp = client
|
||||||
.get(format!("https://chillhop.com/{path}"))
|
.get(format!("https://chillhop.com/{trimmed}"))
|
||||||
.send()
|
.send()
|
||||||
.await
|
.await?;
|
||||||
.unwrap();
|
|
||||||
let text = resp.text().await.unwrap();
|
let status = resp.status();
|
||||||
|
|
||||||
|
if status == 429 {
|
||||||
|
bail!("rate limit reached: {path}");
|
||||||
|
}
|
||||||
|
|
||||||
|
if status != 404 && !status.is_success() && !status.is_redirection() {
|
||||||
|
bail!("non success code {}: {path}", resp.status().as_u16());
|
||||||
|
}
|
||||||
|
|
||||||
|
let text = resp.text().await?;
|
||||||
|
|
||||||
let parent = cache.parent();
|
let parent = cache.parent();
|
||||||
if let Some(x) = parent {
|
if let Some(x) = parent {
|
||||||
if x != Path::new("") {
|
if x != Path::new("") {
|
||||||
fs::create_dir_all(x).await.unwrap();
|
fs::create_dir_all(x).await?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut file = File::create(&cache).await.unwrap();
|
let mut file = File::create(&cache).await?;
|
||||||
file.write_all(text.as_bytes()).await.unwrap();
|
file.write_all(text.as_bytes()).await?;
|
||||||
|
|
||||||
text
|
if status.is_redirection() {
|
||||||
|
bail!("redirect: {path}")
|
||||||
|
}
|
||||||
|
|
||||||
|
if status == 404 {
|
||||||
|
bail!("not found: {path}")
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(text)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn scrape() {
|
async fn scan_page(
|
||||||
|
number: usize,
|
||||||
|
client: &Client,
|
||||||
|
) -> eyre::Result<Vec<impl futures::Future<Output = Result<Release, ReleaseError>>>> {
|
||||||
|
let path = format!("releases/?page={number}");
|
||||||
|
let content = get(client, &path).await?;
|
||||||
|
let html = Html::parse_document(&content);
|
||||||
|
|
||||||
|
let elements = html.select(&RELEASES);
|
||||||
|
Ok(elements
|
||||||
|
.filter_map(|x| {
|
||||||
|
let label = x.select(&RELEASE_LABEL).next()?.inner_html();
|
||||||
|
if label == "Compilation" || label == "Mix" {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
Some(Release::scan(x.attr("href")?.to_string(), client.clone()))
|
||||||
|
})
|
||||||
|
.collect())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn scrape() -> eyre::Result<()> {
|
||||||
const PAGE_COUNT: usize = 40;
|
const PAGE_COUNT: usize = 40;
|
||||||
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36";
|
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36";
|
||||||
|
|
||||||
fs::create_dir_all("./cache/chillhop").await.unwrap();
|
fs::create_dir_all("./cache/chillhop").await.unwrap();
|
||||||
let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
|
let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
|
||||||
|
|
||||||
get(&client, "releases/?page=30").await;
|
let mut futures = FuturesOrdered::new();
|
||||||
|
|
||||||
|
// This is slightly less memory efficient than I'd hope, but it is what it is.
|
||||||
|
for page in 0..=PAGE_COUNT {
|
||||||
|
for x in scan_page(page, &client).await? {
|
||||||
|
futures.push_front(x);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
while let Some(result) = futures.next().await {
|
||||||
|
let release = match result {
|
||||||
|
Ok(release) => release,
|
||||||
|
Err(error) => {
|
||||||
|
eprintln!("error: {}, skipping", error);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
for track in release.tracks {
|
||||||
|
let title = html_escape::decode_html_entities(&track.title);
|
||||||
|
let artist = html_escape::decode_html_entities(
|
||||||
|
track.artists.split(", ").next().unwrap_or(&track.artists),
|
||||||
|
);
|
||||||
|
|
||||||
|
println!("{}!{artist} - {title}", track.file_id)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user