X-Git-Url: https://git.jacobcasper.com/?p=Marketplaylister.git;a=blobdiff_plain;f=marketscraper.php;h=e4a14c40116578731e7073b917a7dad9b68ae683;hp=f7d56de9ca5c5944b296dc16778dc354749f9056;hb=HEAD;hpb=29d84c29eac4bafa7aa1370578bce9e8e0a8f2bc diff --git a/marketscraper.php b/marketscraper.php index f7d56de..e4a14c4 100644 --- a/marketscraper.php +++ b/marketscraper.php @@ -1,76 +1,61 @@ query('SELECT date FROM songs order by date desc limit 1'); - $resultset = $query->fetch(); +require 'mpfuncs.php'; - $lastEpDT = DateTime::createFromFormat(SQLITE_DATE_FORM, $resultset['date']); - $startDate = new DateTime; - $episodeDatePages = []; - $episodeTrackPages = []; - - while ($startDate > $lastEpDT) { - // DOM garbles UTF-8 chars, so loading them to HTML-ENTITIES data fixes this - $html = mb_convert_encoding(file_get_contents('https://www.marketplace.org/latest-music?page=' . $page), 'HTML-ENTITIES', "UTF-8"); - $DOM = new DOMDocument; - $DOM->loadHTML($html); - $headers = $DOM->getElementsByTagName('h2'); - $divs = $DOM->getElementsByTagName('div'); - - $episodeDatePages[] = parseEpisodeDate($headers, $lastEpDT); - $episodeTrackPages[] = parseEpisodePage($divs); - $startDate = end($episodeDatePages[$page - 1]); - $page++; - } - - //print_r($episodeDatePages); +set_time_limit(0); - - //Unroll episodeDatePages - $episodeDates = []; - foreach ($episodeDatePages as $episodeDatePage) { - foreach ($episodeDatePage as $episodeDate) { - $episodeDates[] = $episodeDate; - } - } - - - // Unroll episodeTrackPages - $episodeTrackLists = []; - foreach ($episodeTrackPages as $epTrackPage) { - foreach ($epTrackPage as $epTrackList) { - $episodeTrackLists[] = $epTrackList; - } +const SQLITE_DATE_FORM = 'Y-m-d H:i:s'; + +$pdo = new PDO("sqlite:mktplc.sqlite3"); + +$genDate = DateTime::createFromFormat(DATE_FORM, '1/1/2017'); + +$page = 1; + +$query = $pdo->query('SELECT date FROM songs order by date desc limit 1'); +$resultset = $query->fetch(); + +$lastEpDT = DateTime::createFromFormat(SQLITE_DATE_FORM, $resultset['date']); +$startDate = new DateTime; +$episodeDatePages = []; +$episodeTrackPages = []; + +$episodes = []; +while ($startDate > $lastEpDT) { + // DOM garbles UTF-8 chars, so loading them to HTML-ENTITIES data fixes this + $html = mb_convert_encoding(file_get_contents('https://www.marketplace.org/latest-music/marketplace/page/' . $page), 'HTML-ENTITIES', "UTF-8"); + $DOM = new DOMDocument; + $DOM->loadHTML($html); + $xpath = new DOMXPath($DOM); + $episodeData = $xpath->evaluate("//div[contains(@class, 'mp-music-card')]"); + foreach($episodeData as $episode) { + $children = iterator_to_array($episode->childNodes); + $episodeHeadCard = array_pop(findChildWithClass($children, 'mp-music-card-episode')); + $episodeMeta = array_pop(findChildWithClass($episodeHeadCard->childNodes, 'mp-music-card-meta')); + $episodeDate = array_pop(findChildWithClass($episodeMeta->childNodes, 'mp-music-card-meta_pubdate'))->textContent; + if (!isset($episodeDate)) { continue; } + $trackDiv = array_pop(findChildWithClass($children, 'mp-music-card-tracks')); + $trackItems = findChildWithClass($trackDiv->childNodes, 'flex w-full flex-wrap item'); + $trackIDs = []; + foreach($trackItems as $trackItem) { + $divs = findChildWithClass($trackItem->childNodes, 'w-full min-tablet:w-1/2'); + foreach ($divs as $div) { + $trackIDs[] = array_pop(explode('/', array_pop(findChildWithClass($div->childNodes, 'song-title'))->attributes->getNamedItem('href')->value)); + } + } + $episodes[$episodeDate] = $trackIDs; } - - $episodes = array_slice( - array_map( - null, $episodeDates, $episodeTrackLists), 0, min( - count($episodeDates), count($episodeTrackLists) - ) - ); - print_r($episodes); - - $stmt = $pdo->prepare("INSERT INTO songs (track, artist, date) VALUES (:track, :artist, :date)"); - $stmt->bindParam(':track', $trackName); - $stmt->bindParam(':artist', $artist); - $stmt->bindParam(':date', $date); - foreach(array_reverse($episodes) as $episode) { - $date = $episode[0]->format(SQLITE_DATE_FORM); - foreach ($episode[1] as $track) { - $trackName = $track['title']; - $artist = $track['artist']; + $startDate = new DateTime(end(array_keys($episodes))); + $page++; +} + +$stmt = $pdo->prepare("INSERT INTO songs (date, uri) VALUES (:date, :uri)"); +$stmt->bindParam(':date', $date); +$stmt->bindParam(':uri', $uri); +foreach(array_reverse($episodes) as $airDate => $trackIDs) { + $date = (new DateTime($airDate))->format(SQLITE_DATE_FORM); + foreach ($trackIDs as $trackID) { + $uri = "spotify:track:{$trackID}"; $stmt->execute(); - } } - - \ No newline at end of file +}