Update README because I forgot how to develop this app.
[Marketplaylister.git] / marketscraper.php
index ff55a5e..e4a14c4 100644 (file)
@@ -1,76 +1,130 @@
 <?php
 
-    require 'mpfuncs.php';
-    
-    const SQLITE_DATE_FORM = 'Y-m-d H:i:s';
-    
-    $pdo = new PDO("sqlite:mktplc.sqlite3");
-        
-    $genDate = DateTime::createFromFormat(DATE_FORM, '1/1/2017');
-    
-    $page = 1;
-    
-    $query = $pdo->query('SELECT date FROM songs order by date desc limit 1');
-    $resultset = $query->fetch();
+/**
+ * Scrapes Marketplace's "latest music" pages and stores the tracks of every
+ * episode newer than the newest one already recorded: one row per track
+ * (episode date, Spotify track URI) in the `songs` table of mktplc.sqlite3.
+ */
+require 'mpfuncs.php';
 
-    $lastEpDT = new DateTime::createFromFormat(SQLITE_DATE_FORM, $resultset['date'];
-    $startDate = new DateTime;
-    $episodeDatePages = [];
-    $episodeTrackPages = [];
-    
-    while ($startDate > $lastEpDT) {
-      // DOM garbles UTF-8 chars, so loading them to HTML-ENTITIES data fixes this
-      $html = mb_convert_encoding(file_get_contents('https://www.marketplace.org/latest-music?page=' . $page), 'HTML-ENTITIES', "UTF-8");
-      $DOM = new DOMDocument;
-      $DOM->loadHTML($html);
-      $headers = $DOM->getElementsByTagName('h2');
-      $divs = $DOM->getElementsByTagName('div');
-      
-      $episodeDatePages[] = parseEpisodeDate($headers, $lastEpDT);
-      $episodeTrackPages[] = parseEpisodePage($divs);
-      $startDate = end($episodeDatePages[$page - 1]);
-      $page++;
-    }
-    
-    //print_r($episodeDatePages);
+// The crawl may span many pages, so lift PHP's execution time limit.
+set_time_limit(0);
 
-    
-    //Unroll episodeDatePages
-    $episodeDates = [];
-    foreach ($episodeDatePages as $episodeDatePage) {
-      foreach ($episodeDatePage as $episodeDate) {
-        $episodeDates[] = $episodeDate;
-      }
-    }
-    
-    
-    // Unroll episodeTrackPages
-    $episodeTrackLists = [];
-    foreach ($episodeTrackPages as $epTrackPage) {
-      foreach ($epTrackPage as $epTrackList) {
-        $episodeTrackLists[] = $epTrackList;
-      }
+// Timestamp format of the `date` column in the `songs` table.
+const SQLITE_DATE_FORM = 'Y-m-d H:i:s';
+
+$pdo = new PDO("sqlite:mktplc.sqlite3");
+// Throw on SQL errors instead of returning false (exceptions are the default only since PHP 8).
+$pdo->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
+
+// Cut-off date used while the database is still empty.
+// NOTE(review): DATE_FORM is not defined in this file; presumably mpfuncs.php provides it.
+$genDate = DateTime::createFromFormat(DATE_FORM, '1/1/2017');
+
+$page = 1;
+
+// The newest stored episode date is the cut-off: only later episodes are collected.
+$query = $pdo->query('SELECT date FROM songs order by date desc limit 1');
+$resultset = $query->fetch();
+
+// fetch() returns false for an empty table, which must not be indexed like a row.
+$lastEpDT = ($resultset === false)
+    ? $genDate
+    : DateTime::createFromFormat(SQLITE_DATE_FORM, $resultset['date']);
+if (!($lastEpDT instanceof DateTime)) {
+    throw new RuntimeException('Unable to determine the date of the newest stored episode.');
+}
+$startDate = new DateTime;
+
+// Returns the last node findChildWithClass() yields for $class among $nodes, or null
+// when there is none. array_pop() cannot do this inline: it takes its argument by
+// reference, so feeding it a function result raises a notice.
+$lastWithClass = function ($nodes, $class) {
+    $matches = findChildWithClass($nodes, $class);
+    return empty($matches) ? null : end($matches);
+};
+
+// Collect libxml's complaints about real-world markup instead of emitting a warning for each.
+libxml_use_internal_errors(true);
+
+// Episode date (in SQLITE_DATE_FORM) => list of track IDs played in that episode.
+$episodes = [];
+while ($startDate > $lastEpDT) {
+    $html = file_get_contents('https://www.marketplace.org/latest-music/marketplace/page/' . $page);
+    if ($html === false || $html === '') {
+        // Download failed or the page is empty: there is nothing to parse.
+        break;
+    }
+    // DOM garbles UTF-8 chars, so turn every non-ASCII character into a numeric entity first.
+    // mb_convert_encoding(..., 'HTML-ENTITIES', ...) is deprecated as of PHP 8.2.
+    $html = mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
+    $DOM = new DOMDocument;
+    $DOM->loadHTML($html);
+    libxml_clear_errors();
+    $xpath = new DOMXPath($DOM);
+    // contains() matches every div whose class merely includes "mp-music-card"; divs
+    // without the expected children are skipped by the null checks below.
+    $episodeData = $xpath->evaluate("//div[contains(@class, 'mp-music-card')]");
+    $lastOnPage = null;
+    foreach ($episodeData as $episode) {
+        $children = iterator_to_array($episode->childNodes);
+        $episodeHeadCard = $lastWithClass($children, 'mp-music-card-episode');
+        $episodeMeta = ($episodeHeadCard === null)
+            ? null
+            : $lastWithClass($episodeHeadCard->childNodes, 'mp-music-card-meta');
+        $pubDateNode = ($episodeMeta === null)
+            ? null
+            : $lastWithClass($episodeMeta->childNodes, 'mp-music-card-meta_pubdate');
+        $episodeDate = ($pubDateNode === null) ? '' : trim($pubDateNode->textContent);
+        if ($episodeDate === '') { continue; }
+        $airDT = new DateTime($episodeDate);
+        // NOTE(review): assumes episodes are listed newest first, so the last dated one
+        // on a page tells whether the next page can still hold new episodes.
+        $lastOnPage = $airDT;
+        if ($airDT <= $lastEpDT) {
+            // Not newer than what is stored; inserting it again would duplicate its rows.
+            continue;
+        }
+        $trackDiv = $lastWithClass($children, 'mp-music-card-tracks');
+        $trackItems = ($trackDiv === null)
+            ? []
+            : findChildWithClass($trackDiv->childNodes, 'flex w-full flex-wrap item');
+        $trackIDs = [];
+        foreach ($trackItems as $trackItem) {
+            $divs = findChildWithClass($trackItem->childNodes, 'w-full min-tablet:w-1/2');
+            foreach ($divs as $div) {
+                $link = $lastWithClass($div->childNodes, 'song-title');
+                if (!($link instanceof DOMElement) || !$link->hasAttribute('href')) {
+                    // No usable song link in this cell.
+                    continue;
+                }
+                // The track ID is the last path segment of the link.
+                $segments = explode('/', $link->getAttribute('href'));
+                $trackIDs[] = end($segments);
+            }
+        }
+        $episodes[$airDT->format(SQLITE_DATE_FORM)] = $trackIDs;
     }
-    
-    $episodes = array_slice(
-                  array_map(
-                    null, $episodeDates, $episodeTrackLists), 0, min(
-                      count($episodeDates), count($episodeTrackLists)
-                    )
-                );
-    print_r($episodes);
-    
-    $stmt = $pdo->prepare("INSERT INTO songs (track, artist, date) VALUES (:track, :artist, :date)");
-    $stmt->bindParam(':track', $trackName);
-    $stmt->bindParam(':artist', $artist);
-    $stmt->bindParam(':date', $date);
-    foreach(array_reverse($episodes) as $episode) {
-      $date = $episode[0]->format(SQLITE_DATE_FORM);
-      foreach ($episode[1] as $track) {
-        $trackName = $track['title'];
-        $artist = $track['artist'];
+    if ($lastOnPage === null) {
+        // No dated episode on this page: stop instead of requesting pages forever.
+        break;
+    }
+    $startDate = $lastOnPage;
+    $page++;
+}
+
+$stmt = $pdo->prepare("INSERT INTO songs (date, uri) VALUES (:date, :uri)");
+$stmt->bindParam(':date', $date);
+$stmt->bindParam(':uri', $uri);
+// One transaction for the whole batch; an uncaught failure leaves nothing half-written.
+$pdo->beginTransaction();
+// Reverse the collection order before inserting (oldest first if pages list newest first).
+foreach (array_reverse($episodes) as $airDate => $trackIDs) {
+    // Assign through $date: it is bound by reference to the :date placeholder.
+    $date = $airDate;
+    foreach ($trackIDs as $trackID) {
+        $uri = "spotify:track:{$trackID}";
         $stmt->execute();
-      }
     }
-    
-    
\ No newline at end of file
+}
+$pdo->commit();