Update README because I forgot how to develop this app.
[Marketplaylister.git] / marketscraper.php
<?php

/**
 * Scrapes the Marketplace "latest music" listing and stores the Spotify track
 * URIs of every episode newer than the most recent one already recorded in the
 * `songs` table of mktplc.sqlite3.
 *
 * Listing pages are walked from page 1 onwards until an episode at or before
 * the newest stored date is reached, a page cannot be fetched, or a page
 * contains no episode cards.
 */

require 'mpfuncs.php';

// Scraping may span many listing pages, so lift the execution time limit.
set_time_limit(0);

const SQLITE_DATE_FORM = 'Y-m-d H:i:s';

$pdo = new PDO("sqlite:mktplc.sqlite3");

// NOTE(review): DATE_FORM is not defined in this file; presumably mpfuncs.php
// provides it. Confirm.
$genDate = DateTime::createFromFormat(DATE_FORM, '1/1/2017');

$page = 1;

$query = $pdo->query('SELECT date FROM songs order by date desc limit 1');
if ($query === false) {
    throw new RuntimeException('Could not read the most recent song date from the songs table.');
}
$resultset = $query->fetch(PDO::FETCH_ASSOC);

// Newest date already stored. An empty table or an unparseable value would
// otherwise yield `false`, which every DateTime compares greater than, so the
// scrape loop would never reach its cutoff. Fall back to the genesis date.
$lastEpDT = false;
if (is_array($resultset) && isset($resultset['date'])) {
    $lastEpDT = DateTime::createFromFormat(SQLITE_DATE_FORM, $resultset['date']);
}
if (!($lastEpDT instanceof DateTime)) {
    $lastEpDT = ($genDate instanceof DateTime) ? $genDate : new DateTime('2017-01-01 00:00:00');
}

$startDate = new DateTime;
$episodeDatePages = [];
$episodeTrackPages = [];

// Returns the last node matched by findChildWithClass(), or null when nothing
// matches. array_pop()/end() take their argument by reference, so they must
// not be applied directly to a function's return value.
// NOTE(review): assumes findChildWithClass() returns an array; verify in mpfuncs.php.
$lastWithClass = static function ($nodes, $class) {
    $matches = findChildWithClass($nodes, $class);
    return empty($matches) ? null : end($matches);
};

// HTML5 markup makes libxml emit many harmless warnings; collect them
// internally instead of printing them.
$previousLibxmlSetting = libxml_use_internal_errors(true);

// Map of episode air-date text => list of Spotify track IDs, in scrape order.
$episodes = [];
while ($startDate > $lastEpDT) {
    $rawHtml = file_get_contents('https://www.marketplace.org/latest-music/marketplace/page/' . $page);
    if ($rawHtml === false || $rawHtml === '') {
        // Past the last page, or a network failure: stop instead of looping forever.
        error_log("marketscraper: could not fetch listing page {$page}; stopping.");
        break;
    }

    // DOM garbles UTF-8 chars, so non-ASCII characters are turned into numeric
    // entities first. (The 'HTML-ENTITIES' target of mb_convert_encoding is
    // deprecated as of PHP 8.2.)
    $html = mb_encode_numericentity($rawHtml, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');

    $DOM = new DOMDocument;
    $DOM->loadHTML($html);
    libxml_clear_errors();

    $xpath = new DOMXPath($DOM);
    $episodeData = $xpath->evaluate("//div[contains(@class, 'mp-music-card')]");

    $episodesOnPage = 0;
    foreach ($episodeData as $episode) {
        $children = iterator_to_array($episode->childNodes);

        // Walk card -> head card -> meta -> publication date, skipping any
        // card that lacks one of these nodes.
        $episodeHeadCard = $lastWithClass($children, 'mp-music-card-episode');
        if ($episodeHeadCard === null) {
            continue;
        }
        $episodeMeta = $lastWithClass($episodeHeadCard->childNodes, 'mp-music-card-meta');
        if ($episodeMeta === null) {
            continue;
        }
        $dateNode = $lastWithClass($episodeMeta->childNodes, 'mp-music-card-meta_pubdate');
        if ($dateNode === null) {
            continue;
        }
        $episodeDate = trim($dateNode->textContent);
        if ($episodeDate === '') {
            continue;
        }

        $trackIDs = [];
        $trackDiv = $lastWithClass($children, 'mp-music-card-tracks');
        if ($trackDiv !== null) {
            $trackItems = findChildWithClass($trackDiv->childNodes, 'flex w-full flex-wrap item');
            foreach ($trackItems as $trackItem) {
                $divs = findChildWithClass($trackItem->childNodes, 'w-full min-tablet:w-1/2');
                foreach ($divs as $div) {
                    $link = $lastWithClass($div->childNodes, 'song-title');
                    if (!($link instanceof DOMElement) || !$link->hasAttribute('href')) {
                        continue;
                    }
                    // The track ID is the last path segment of the link's href.
                    $segments = explode('/', $link->getAttribute('href'));
                    $trackID = end($segments);
                    if ($trackID !== '') {
                        $trackIDs[] = $trackID;
                    }
                }
            }
        }

        $episodes[$episodeDate] = $trackIDs;
        $episodesOnPage++;
    }

    if ($episodesOnPage === 0) {
        // Nothing recognisable on this page; later pages will not help either.
        break;
    }

    // Continue from the most recently appended (oldest scraped) episode.
    end($episodes);
    $startDate = new DateTime((string) key($episodes));
    $page++;
}

libxml_use_internal_errors($previousLibxmlSetting);

$stmt = $pdo->prepare("INSERT INTO songs (date, uri) VALUES (:date, :uri)");
if ($stmt === false) {
    throw new RuntimeException('Could not prepare the INSERT statement for the songs table.');
}

// Insert oldest-first inside one transaction so an interrupted run cannot
// leave a half-written episode that the next run would then skip.
$pdo->beginTransaction();
try {
    foreach (array_reverse($episodes, true) as $airDate => $trackIDs) {
        $airDT = new DateTime((string) $airDate);
        if ($airDT <= $lastEpDT) {
            // The boundary page also lists episodes that are already stored.
            continue;
        }
        $date = $airDT->format(SQLITE_DATE_FORM);
        foreach ($trackIDs as $trackID) {
            $uri = "spotify:track:{$trackID}";
            if ($stmt->execute([':date' => $date, ':uri' => $uri]) === false) {
                // Only reached when PDO is in silent/warning error mode.
                error_log("marketscraper: failed to insert {$uri} for {$date}: " . implode(' ', $stmt->errorInfo()));
            }
        }
    }
    $pdo->commit();
} catch (Exception $e) {
    $pdo->rollBack();
    throw $e;
}