updated script

parse html instead of just regex for complete file names
This commit is contained in:
2025-10-11 00:20:38 -05:00
parent 6c9b6c9ec1
commit 7e842d5db6
2 changed files with 719 additions and 28 deletions

182
main.ts
View File

@@ -1,13 +1,14 @@
import { existsSync } from 'node:fs'
import { mkdir, writeFile } from 'node:fs/promises'
import path from 'node:path'
import {argv} from 'node:process'
import { argv } from 'node:process'
import { Buffer } from "node:buffer"
import * as cheerio from "https://esm.sh/cheerio?target=esnext"
// Album page URL: https://downloads.khinsider.com/game-soundtracks/album/<slug>
// with an optional trailing slash (case-insensitive).
const REGEX_PLAYLISTURL = /^https:\/\/downloads\.khinsider\.com\/game-soundtracks\/album\/[^\/]+\/?$/i
// Song page URL: an album page URL followed by exactly one more path segment.
const REGEX_SONGURL = /^https:\/\/downloads\.khinsider\.com\/game-soundtracks\/album\/[^\/]+\/[^\/]+\/?$/i
// Path of the form /soundtracks/<a>/<b>/<c>; captures the first segment (group 1)
// and the last segment (group 2).
const REGEX_FILEPATHPARSE = /^\/soundtracks\/([^\/]+)\/[^\/]+\/([^\/]+)$/i
// Captures the href of every anchor that directly follows a
// <td class="playlistDownloadSong"> opening tag (global, for matchAll).
const REGEX_PLAYLIST_SONGHREF = /<td class="playlistDownloadSong"><a href="(.*?)">/gi
// Captures an href ending in ".flac" on an anchor immediately followed by a
// <span> whose class starts with "songDownloadLink".
const REGEX_SONG_FILEHREF = /<a\s+href=['"](.*?\.flac)['"]><span\s+class=['"]songDownloadLink/i
// Captures everything in a page title that precedes " MP3 - Download".
const REGEX_ALBUMTITLE = /^(.*?) MP3 - Download/i
// Matches any single character other than ASCII letters, digits, space and
// - _ = + , . ( ) [ ] { } (global, so replace() strips every occurrence).
const REGEX_UNSAFEFORFILE = /[^a-z0-9\-_=+,.()\[\]{} ]/gi
async function main() {
const playlistURL = argv[2]
@@ -25,23 +26,129 @@ async function main() {
}
}
/**
 * Downloads an album page and returns the songs listed on it.
 *
 * NOTE(review): this span is taken from a commit diff view, so the removed and
 * added versions of several lines appear back to back (both signatures, the
 * `resp`/`req` fetches, both `console.log` calls and both `return` statements).
 *
 * Throws a string when `url` does not match REGEX_PLAYLISTURL, or when the
 * page title does not match REGEX_ALBUMTITLE.
 */
async function fetchPlaylist(url: string): Promise<string[]> {
async function fetchPlaylist(url: string): Promise<PlaylistSongData[]> {
// Only album page URLs are accepted.
if (!REGEX_PLAYLISTURL.test(url)) {
throw `unaccepted url ${url}`
}
const resp = await fetch(url)
const text = await resp.text()
const req = await fetch(url)
const text = await req.text()
// Regex-based extraction of the song links from the raw HTML.
const matches = text.matchAll(REGEX_PLAYLIST_SONGHREF)
const matchesArr = [...matches].map(match => match[1])
// Parse the page so the title and the song table can be queried with selectors.
const $ = cheerio.load(text)
console.log(`downloading ${matchesArr.length} songs`)
// Text of the <title> element; empty string when it is missing.
const title = $.extract({
title: 'title'
}).title ?? ''
const titleMatch = REGEX_ALBUMTITLE.exec(title)
if (titleMatch === null) {
throw `unable to grab album name from ${title}`
}
// Group 1 is the part of the title before " MP3 - Download".
const albumName = titleMatch[1]
// Resolve the column positions first, then read every song row with them.
const columns = getPlaylistTableHeaders($)
const rows = getPlaylistRows($, columns, albumName)
console.log(`downloading ${rows.length} songs`)
return matchesArr
return rows
}
async function downloadSong(url: string, location: string) {
/**
 * Zero-based cell positions of the columns read from the song list table.
 * A value of -1 means the column was not found in the header row.
 */
interface PlaylistTableColumns {
  /** Position of the 'cd' column. */
  cd: number
  /** Position of the '#' (track number) column. */
  track: number
  /** Position of the 'song name' column. */
  name: number
}
/**
 * One song row read from an album page's song list.
 */
interface PlaylistSongData {
  /** Album title shared by every song of the playlist. */
  album: string
  /** Text of the row's song-name cell. */
  name: string
  /** CD number parsed from the row, or -1 when unavailable. */
  cd: number
  /** Track number parsed from the row, or -1 when unavailable. */
  track: number
  /** href of the row's `td.playlistDownloadSong a` link; empty string when the row has none. */
  url: string
}
/**
 * Locates the columns of interest in the `#songlist` header row.
 *
 * Header labels are compared case-insensitively after trimming surrounding
 * whitespace, so a cell such as `<th> CD </th>` is still recognised. When a
 * label occurs more than once, the last occurrence wins.
 *
 * @param $ - Parsed album page.
 * @returns Zero-based header cell indexes for the 'song name', '#' and 'cd'
 *   columns; `track` and `cd` are -1 when their header is absent.
 * @throws A string when no 'song name' header cell exists.
 */
function getPlaylistTableHeaders($: cheerio.CheerioAPI): PlaylistTableColumns {
  const { cells } = $('#songlist tr#songlist_header').extract({
    cells: ['th']
  })

  const indexes: PlaylistTableColumns = { name: -1, track: -1, cd: -1 }
  cells.forEach((cell, i) => {
    // The labels are plain ASCII, so a locale-independent lower-casing is enough.
    switch (cell.trim().toLowerCase()) {
      case 'cd':
        indexes.cd = i
        break
      case '#':
        indexes.track = i
        break
      case 'song name':
        indexes.name = i
        break
    }
  })

  // Track and CD numbers are optional, but rows are useless without a name.
  if (indexes.name === -1) {
    throw 'unable to find song title column'
  }
  return indexes
}
/**
 * Reads every song row of the `#songlist` table (header and footer rows are
 * excluded by id) into a `PlaylistSongData` record.
 *
 * @param $ - Parsed album page.
 * @param columns - Cell indexes of the name / track / cd columns; a negative
 *   `track` or `cd` index means that column is not read.
 * @param albumName - Album title copied onto every returned record.
 * @returns One record per row, in document order. `track` and `cd` are -1
 *   when the column is not read or its cell is missing or not numeric; `url`
 *   is an empty string when the row has no `td.playlistDownloadSong a` link.
 * @throws A string when a row's song-name cell is missing or blank.
 */
function getPlaylistRows($: cheerio.CheerioAPI, columns: PlaylistTableColumns, albumName: string): PlaylistSongData[] {
  // Missing or non-numeric cells map to -1, the "not available" marker.
  const toNumberOrMissing = (text: string | undefined): number => {
    const parsed = Number.parseInt(text ?? '', 10)
    return Number.isNaN(parsed) ? -1 : parsed
  }

  const rowsData: PlaylistSongData[] = []
  $('#songlist tr:not(#songlist_header):not(#songlist_footer)').each((rowIndex, rowEl) => {
    // Query inside the row through the page's own `$` rather than loading
    // each row into a separate document.
    const row = $(rowEl)

    const url = row.extract({
      url: {
        selector: 'td.playlistDownloadSong a',
        value: 'href'
      }
    }).url ?? ''
    const rowCells = row.extract({
      cells: ['td']
    }).cells

    const name = (rowCells[columns.name] ?? '').trim()
    if (!name) {
      throw `unable to grab song name from ${albumName} in row ${rowIndex} (column ${columns.name}) - ${rowCells}`
    }

    rowsData.push({
      url,
      album: albumName,
      name,
      track: columns.track >= 0 ? toNumberOrMissing(rowCells[columns.track]) : -1,
      cd: columns.cd >= 0 ? toNumberOrMissing(rowCells[columns.cd]) : -1,
    })
  })
  return rowsData
}
async function downloadSong(song: PlaylistSongData, location: string) {
let url = song.url
if (!/^http/i.test(url)) {
url = 'https://downloads.khinsider.com' + url
}
@@ -49,24 +156,22 @@ async function downloadSong(url: string, location: string) {
throw `unaccepted url ${url}`
}
let resp = await fetch(url)
let text = await resp.text()
let match = text.match(REGEX_SONG_FILEHREF)
if (!match) {
const resp = await fetch(url)
const text = await resp.text()
const $ = cheerio.load(text)
const flacUrl = $.extract({
url: {
selector: '#pageContent a[href*="flac"]',
value: 'href'
}
}).url
if (!flacUrl) {
throw `can't find download link for ${url}`
}
const songurl = new URL(match[1])
const songurlmatch = REGEX_FILEPATHPARSE.exec(songurl.pathname)
if (!songurlmatch) {
throw `can't find folder and filename for ${songurl}`
}
const foldername = decodeURIComponent(songurlmatch[1])
const filename = decodeURIComponent(songurlmatch[2])
const pathname = path.resolve(location, foldername)
const fullpathname = path.resolve(pathname, filename)
const { pathname, fullpathname } = pathFor(location, song)
if (!existsSync(pathname)) {
await mkdir(pathname)
@@ -76,13 +181,34 @@ async function downloadSong(url: string, location: string) {
}
console.log(`downloading ${fullpathname}`)
console.log(` from ${flacUrl}`)
const songresp = await fetch(songurl)
const songresp = await fetch(flacUrl)
const songblob = await songresp.arrayBuffer()
return writeFile(fullpathname, toBuffer(songblob))
}
/**
 * Destination of a downloaded song on disk.
 */
interface SongPath {
  /** Absolute path of the song file inside `pathname`. */
  fullpathname: string
  /** Absolute path of the album directory. */
  pathname: string
}
/**
 * Computes where a song should be written on disk.
 *
 * Album and song names are stripped of every character matched by
 * REGEX_UNSAFEFORFILE. The file name is prefixed with `<cd>.` and/or
 * `<track>.` (followed by a single space) for whichever of those values is
 * non-negative, and always ends in `.flac`.
 *
 * @param location - Base directory; resolved with `path.resolve`.
 * @param song - Song whose album, name, cd and track determine the path.
 * @returns The album directory (`pathname`) and the song file inside it
 *   (`fullpathname`).
 */
function pathFor(location: string, song: PlaylistSongData): SongPath {
  const sanitizedAlbum = song.album.replace(REGEX_UNSAFEFORFILE, '')
  // '.' and ' ' survive sanitising, so an album reduced to '', '.' or '..'
  // would make path.resolve return `location` itself or its parent. Use a
  // fixed folder name instead so files always land below `location`.
  const albumname = /^[. ]*$/.test(sanitizedAlbum) ? 'unknown album' : sanitizedAlbum
  const songname = song.name.replace(REGEX_UNSAFEFORFILE, '')

  const hasCd = song.cd >= 0
  const hasTrack = song.track >= 0
  const prefix =
    (hasCd ? `${song.cd}.` : '') +
    (hasTrack ? `${song.track}.` : '') +
    (hasCd || hasTrack ? ' ' : '')
  const filename = `${prefix}${songname}.flac`

  const pathname = path.resolve(location, albumname)
  const fullpathname = path.resolve(pathname, filename)
  return {
    pathname,
    fullpathname
  }
}
function toBuffer(arrayBuffer: ArrayBuffer) {
const buffer = Buffer.alloc(arrayBuffer.byteLength);
const view = new Uint8Array(arrayBuffer);