X-Git-Url: https://git.jacobcasper.com/?p=Marketplaylister.git;a=blobdiff_plain;f=marketscraper.php;fp=marketscraper.php;h=ff55a5e823892f842f9efaf62816d8c74127f7ab;hp=0000000000000000000000000000000000000000;hb=168896268ee9a19bce7125b029a37c34437aeb52;hpb=bed05b1a3c4b40cb71564fbebb2cb41242163dbd
diff --git a/marketscraper.php b/marketscraper.php
new file mode 100644
index 0000000..ff55a5e
--- /dev/null
+++ b/marketscraper.php
@@ -0,0 +1,76 @@
+query('SELECT date FROM songs order by date desc limit 1');
+    // The query above orders songs by date descending and keeps one row,
+    // i.e. the newest date already stored.
+    $resultset = $query->fetch();
+
+    // createFromFormat() is a static factory, so it is called directly
+    // (not through `new`) to parse the stored date string.
+    // NOTE(review): fetch() presumably yields false when the songs table is
+    // empty, and createFromFormat() yields false on a format mismatch; neither
+    // case is handled here - confirm the intended first-run behaviour.
+    $lastEpDT = DateTime::createFromFormat(SQLITE_DATE_FORM, $resultset['date']);
+    $startDate = new DateTime;
+    $episodeDatePages = [];
+    $episodeTrackPages = [];
+
+    // Keep fetching listing pages while the last date collected from a page
+    // is still newer than the newest stored date.
+    // NOTE(review): $page is initialised outside the visible code; the
+    // `$page - 1` index below presumably relies on it starting at 1 - confirm.
+    while ($startDate > $lastEpDT) {
+        // DOM garbles UTF-8 chars, so loading them to HTML-ENTITIES data fixes this
+        $html = mb_convert_encoding(file_get_contents('https://www.marketplace.org/latest-music?page=' . $page), 'HTML-ENTITIES', "UTF-8");
+        $DOM = new DOMDocument;
+        $DOM->loadHTML($html);
+        $headers = $DOM->getElementsByTagName('h2');
+        $divs = $DOM->getElementsByTagName('div');
+
+        $episodeDatePages[] = parseEpisodeDate($headers, $lastEpDT);
+        $episodeTrackPages[] = parseEpisodePage($divs);
+        // end() returns false for an empty array, which also ends the loop.
+        $startDate = end($episodeDatePages[$page - 1]);
+        $page++;
+    }
+
+    // Flatten the per-page date lists into one list of episode dates.
+    $episodeDates = [];
+    foreach ($episodeDatePages as $episodeDatePage) {
+        foreach ($episodeDatePage as $episodeDate) {
+            $episodeDates[] = $episodeDate;
+        }
+    }
+
+    // Flatten the per-page track lists into one list of episode track lists.
+    $episodeTrackLists = [];
+    foreach ($episodeTrackPages as $epTrackPage) {
+        foreach ($epTrackPage as $epTrackList) {
+            $episodeTrackLists[] = $epTrackList;
+        }
+    }
+
+    // array_map() with a null callback zips the two lists position by
+    // position; the slice trims the result to the shorter list so no pair
+    // carries a null-padded side.
+    $episodes = array_slice(
+        array_map(null, $episodeDates, $episodeTrackLists),
+        0,
+        min(count($episodeDates), count($episodeTrackLists))
+    );
+    print_r($episodes);
+
+    // bindParam() binds by reference, so assigning $trackName, $artist and
+    // $date before each execute() supplies that row's values.
+    $stmt = $pdo->prepare("INSERT INTO songs (track, artist, date) VALUES (:track, :artist, :date)");
+    $stmt->bindParam(':track', $trackName);
+    $stmt->bindParam(':artist', $artist);
+    $stmt->bindParam(':date', $date);
+    // Episodes are inserted in the reverse of the order they were collected.
+    foreach (array_reverse($episodes) as $episode) {
+        $date = $episode[0]->format(SQLITE_DATE_FORM);
+        foreach ($episode[1] as $track) {
+            $trackName = $track['title'];
+            $artist = $track['artist'];
+            $stmt->execute();
+        }
+    }
\ No newline at end of file