updated script

Parse the HTML instead of relying on regexes alone, so complete file names can be captured
2025-10-11 00:20:38 -05:00
parent 6c9b6c9ec1
commit 7e842d5db6
2 changed files with 719 additions and 28 deletions
--- a/main.ts
+++ b/main.ts
@@ -1,13 +1,14 @@
 import { existsSync } from 'node:fs'
 import { mkdir, writeFile } from 'node:fs/promises'
 import path from 'node:path'
-import {argv} from 'node:process'
+import { argv } from 'node:process'
+import { Buffer } from "node:buffer"
+import * as cheerio from "https://esm.sh/cheerio?target=esnext"

 const REGEX_PLAYLISTURL = /^https:\/\/downloads\.khinsider\.com\/game-soundtracks\/album\/[^\/]+\/?$/i
 const REGEX_SONGURL = /^https:\/\/downloads\.khinsider\.com\/game-soundtracks\/album\/[^\/]+\/[^\/]+\/?$/i
-const REGEX_FILEPATHPARSE = /^\/soundtracks\/([^\/]+)\/[^\/]+\/([^\/]+)$/i
-const REGEX_PLAYLIST_SONGHREF = /<td class="playlistDownloadSong"><a href="(.*?)">/gi
-const REGEX_SONG_FILEHREF = /<a\s+href=['"](.*?\.flac)['"]><span\s+class=['"]songDownloadLink/i
+const REGEX_ALBUMTITLE = /^(.*?) MP3 - Download/i // group 1 lazily captures everything before the first " MP3 - Download" in the page title
+const REGEX_UNSAFEFORFILE = /[^a-z0-9\-_=+,.()\[\]{} ]/gi // matches every character that is NOT an ASCII letter, digit, space or one of - _ = + , . ( ) [ ] { }; used to strip such characters from names

 async function main() {
  const playlistURL = argv[2]
@@ -25,23 +26,129 @@ async function main() {
  }
 }

-async function fetchPlaylist(url: string): Promise<string[]> {
+async function fetchPlaylist(url: string): Promise<PlaylistSongData[]> { // Fetches an album page and returns one PlaylistSongData per song row; throws a string for a URL rejected by REGEX_PLAYLISTURL or a title REGEX_ALBUMTITLE cannot match.
   if (!REGEX_PLAYLISTURL.test(url)) {
     throw `unaccepted url ${url}`
   }

-  const resp = await fetch(url)
-  const text = await resp.text()
+  const req = await fetch(url) // NOTE: despite its name, `req` holds the value fetch() resolves to (the response)
+  const text = await req.text()

-  const matches = text.matchAll(REGEX_PLAYLIST_SONGHREF)
-  const matchesArr = [...matches].map(match => match[1])
+  const $ = cheerio.load(text) // parse the page so the title and song table can be queried with selectors

-  console.log(`downloading ${matchesArr.length} songs`)
+  const title = $.extract({ // pull the value for the `title` selector; falls back to '' when nothing is found
+    title: 'title'
+  }).title ?? ''
+  const titleMatch = REGEX_ALBUMTITLE.exec(title)
+  if (titleMatch === null) {
+    throw `unable to grab album name from ${title}`
+  }
+  const albumName = titleMatch[1] // the part of the title before " MP3 - Download"
+
+  const columns = getPlaylistTableHeaders($) // which header cell index holds name / track / cd
+  const rows = getPlaylistRows($, columns, albumName)
+
+  console.log(`downloading ${rows.length} songs`)
  
-  return matchesArr
+  return rows
 }

-async function downloadSong(url: string, location: string) {
+interface PlaylistTableColumns { // Header cell indexes located by getPlaylistTableHeaders; -1 means the column was not found
+  name: number // index of the "song name" header cell
+  track: number // index of the "#" header cell
+  cd: number // index of the "cd" header cell
+}
+interface PlaylistSongData { // One song row of the playlist table, as filled in by getPlaylistRows
+  url: string // href of the row's `td.playlistDownloadSong a` link, or '' when none was found
+  album: string // album name handed to getPlaylistRows
+  name: string // text of the row's cell in the song-name column
+  track: number // integer parsed from the track column; -1 when the column/cell is absent or not a number
+  cd: number // integer parsed from the cd column; -1 when the column/cell is absent or not a number
+}
+
+/**
+ * Locates the columns of interest in the `#songlist` header row.
+ *
+ * Header text is trimmed and lower-cased before matching, so `cd`, `#` and
+ * `song name` are recognised regardless of letter case or surrounding
+ * whitespace in the markup. If a label appears more than once, the last
+ * occurrence wins.
+ *
+ * NOTE(review): the returned values index the header's `th` cells; callers
+ * use them to index a row's `td` cells, which assumes both line up one-to-one.
+ *
+ * @param $ Cheerio document for the album page.
+ * @returns Zero-based header cell index per column; -1 for a column that is absent.
+ * @throws A string when no `song name` column is present.
+ */
+function getPlaylistTableHeaders($: cheerio.CheerioAPI): PlaylistTableColumns {
+  const { cells } = $('#songlist tr#songlist_header').extract({
+    cells: ['th']
+  })
+
+  const indexes: PlaylistTableColumns = {
+    name: -1,
+    track: -1,
+    cd: -1,
+  }
+
+  cells.forEach((cell, i) => {
+    // Trim first: whitespace around the label would otherwise defeat the match.
+    switch (cell.trim().toLocaleLowerCase()) {
+      case 'cd':
+        indexes.cd = i
+        break
+      case '#':
+        indexes.track = i
+        break
+      case 'song name':
+        indexes.name = i
+        break
+    }
+  })
+
+  if (indexes.name === -1) {
+    throw 'unable to find song title column'
+  }
+
+  return indexes
+}
+
+/**
+ * Builds one PlaylistSongData per data row of the `#songlist` table.
+ *
+ * Every `tr` inside `#songlist` except the header and footer rows is read.
+ * Rows are queried in place through `$` rather than being loaded into a
+ * separate document each.
+ *
+ * @param $ Cheerio document for the album page.
+ * @param columns Cell indexes as returned by getPlaylistTableHeaders.
+ * @param albumName Album name stored on every returned entry.
+ * @returns The rows in document order. `url` is '' when a row has no
+ *   `td.playlistDownloadSong a` link; `track` / `cd` are -1 when the column
+ *   is absent or the cell is not a number.
+ * @throws A string when a row has no (non-blank) text in the song-name column.
+ */
+function getPlaylistRows($: cheerio.CheerioAPI, columns: PlaylistTableColumns, albumName: string): PlaylistSongData[] {
+  const rows = $('#songlist tr:not(#songlist_header):not(#songlist_footer)')
+  const rowsData: PlaylistSongData[] = []
+
+  // Reads a cell as a base-10 integer; a missing column (index < 0), a
+  // missing cell, or non-numeric text all yield -1.
+  const numberAt = (cells: readonly (string | undefined)[], index: number): number => {
+    if (index < 0) {
+      return -1
+    }
+    const value = parseInt(cells[index] ?? '', 10)
+    return isNaN(value) ? -1 : value
+  }
+
+  rows.each((_, rowEl) => {
+    const { url, cells } = $(rowEl).extract({
+      url: {
+        selector: 'td.playlistDownloadSong a',
+        value: 'href'
+      },
+      cells: ['td']
+    })
+
+    // Trim so whitespace in the markup does not leak into the song name.
+    const name = cells[columns.name]?.trim()
+    if (!name) {
+      throw `unable to grab song name from ${albumName} in column ${columns.name} - ${cells}`
+    }
+
+    rowsData.push({
+      url: url ?? '',
+      album: albumName,
+      name,
+      track: numberAt(cells, columns.track),
+      cd: numberAt(cells, columns.cd),
+    })
+  })
+
+  return rowsData
+}
+
+async function downloadSong(song: PlaylistSongData, location: string) {
+  let url = song.url
  if (!/^http/i.test(url)) {
    url = 'https://downloads.khinsider.com' + url
  }
@@ -49,24 +156,22 @@ async function downloadSong(url: string, location: string) {
    throw `unaccepted url ${url}`
  }

-  let resp = await fetch(url)
-  let text = await resp.text()
-  let match = text.match(REGEX_SONG_FILEHREF)
-  if (!match) {
+  const resp = await fetch(url)
+  const text = await resp.text()
+
+  const $ = cheerio.load(text)
+  const flacUrl = $.extract({
+    url: {
+      selector: '#pageContent a[href*="flac"]',
+      value: 'href'
+    }
+  }).url
+
+  if (!flacUrl) {
    throw `can't find download link for ${url}`
  }

-  
-  const songurl = new URL(match[1])
-  const songurlmatch = REGEX_FILEPATHPARSE.exec(songurl.pathname)
-  if (!songurlmatch) {
-    throw `can't find folder and filename for ${songurl}`
-  }
-
-  const foldername = decodeURIComponent(songurlmatch[1])
-  const filename = decodeURIComponent(songurlmatch[2])
-  const pathname = path.resolve(location, foldername)
-  const fullpathname = path.resolve(pathname, filename)
+  const { pathname, fullpathname } = pathFor(location, song)
  
  if (!existsSync(pathname)) {
    await mkdir(pathname)
@@ -76,13 +181,34 @@ async function downloadSong(url: string, location: string) {
  }

  console.log(`downloading ${fullpathname}`)
+  console.log(`  from ${flacUrl}`)

-  const songresp = await fetch(songurl)
+  const songresp = await fetch(flacUrl)
  const songblob = await songresp.arrayBuffer()

  return writeFile(fullpathname, toBuffer(songblob))
 }

+interface SongPath { // Result of pathFor
+  pathname: string // album directory: `location` resolved with the sanitized album name
+  fullpathname: string // target file: `pathname` resolved with the generated file name
+}
+
+/**
+ * Computes where a song should be written on disk.
+ *
+ * The album and song names are stripped of every character matched by
+ * REGEX_UNSAFEFORFILE and trimmed. A name that ends up empty or made only of
+ * dots (for example a title written entirely in non-ASCII characters, or
+ * `..`) is replaced by a placeholder, so the album folder never collapses to
+ * `location` or its parent and the file is never just `.flac`.
+ *
+ * The file name is `<cd>.<track>. <song>.flac`, with the cd and track parts
+ * included only when they are known (>= 0).
+ *
+ * @param location Base directory downloads are placed under.
+ * @param song Song whose album, name, cd and track determine the path.
+ * @returns The resolved album directory and the resolved file path inside it.
+ */
+function pathFor(location: string, song: PlaylistSongData): SongPath {
+  // Removes unsafe characters; falls back when nothing usable remains.
+  const sanitize = (value: string, fallback: string): string => {
+    const cleaned = value.replace(REGEX_UNSAFEFORFILE, '').trim()
+    return /^\.*$/.test(cleaned) ? fallback : cleaned
+  }
+
+  const albumname = sanitize(song.album, 'unknown album')
+  const songname = sanitize(song.name, 'untitled')
+
+  // "1.2. " for cd 1 / track 2, "2. " for track only, '' when neither is known.
+  const numbers = [song.cd, song.track].filter(n => n >= 0)
+  const prefix = numbers.length > 0 ? `${numbers.join('.')}. ` : ''
+  const filename = `${prefix}${songname}.flac`
+
+  const pathname = path.resolve(location, albumname)
+  const fullpathname = path.resolve(pathname, filename)
+
+  return {
+    pathname,
+    fullpathname
+  }
+}
+
 function toBuffer(arrayBuffer: ArrayBuffer) {
  const buffer = Buffer.alloc(arrayBuffer.byteLength);
  const view = new Uint8Array(arrayBuffer);