v0.3:
author Jacob Casper <dev@jacobcasper.com>
Wed, 4 Apr 2018 02:20:53 +0000 (22:20 -0400)
committer Jacob Casper <dev@jacobcasper.com>
Wed, 4 Apr 2018 02:20:53 +0000 (22:20 -0400)
commit ca40ac9fa60bda69f4b7a3415ce4d97bd06257fc
Author: Jacob Casper <jacob.thomas.casper@gmail.com>
Date:   Tue Apr 3 22:18:45 2018 -0400

    Refactored scraping and Spotify search tools, added DB

callback.php
marketplay.php
marketscraper.php [new file with mode: 0644]
mktplc.sqlite3 [new file with mode: 0644]
mpfuncs.php
searchify.php [new file with mode: 0644]

index 7488530..2b92014 100644 (file)
         \r
     $code = $_GET['code'];\r
     \r
-    if (!$code) {\r
-        exit(1);\r
-    }\r
-    \r
-    $today = new DateTime;\r
-    \r
-    #print_r($today);\r
-    \r
-    $prevDTTxt = file_get_contents(DATE_FILE);\r
+    // Get month and year user is requesting\r
+    $state = explode(':', $_GET['state']);\r
     \r
-    $prevDT = $prevDTTxt ? DateTime::createFromFormat(DATE_FORM, $prevDTTxt) : DateTime::createFromFormat(DATE_FORM, $today->format('m/') . '01' . $today->format('/Y'));\r
     \r
-    if (strcmp($prevDT->format('m'), $today->format('m')) < 0) {\r
-        $prevDT = DateTime::createFromFormat(DATE_FORM, $today->format('m/') . '01' . $today->format('/Y'));\r
+    if (!$code) {\r
+        exit(1);\r
     }\r
-    \r
+        \r
+    #print_r($today);    \r
     \r
     #Handle Spotify Token Authorization\r
     \r
     \r
     $spot_req = file_get_contents(AUTH_URL . 'api/token', false, $token_context);\r
     \r
-    #echo $spot_req;\r
+    echo $spot_req;\r
     $spot_json = json_decode($spot_req, true);\r
 \r
     $spot_token = $spot_json['access_token'];\r
     \r
+    \r
     $me_opts = [\r
         'http' => [\r
             'method' => 'GET',\r
     $me_json = json_decode($me_resp, true);\r
     $me_id = $me_json['id'];\r
     \r
-    echo '<br />';\r
-    #print_r($me_resp);\r
-    \r
-    $page = 1;\r
-    $html = file_get_contents('https://www.marketplace.org/latest-music');\r
-    $DOM = new DOMDocument;\r
-    $DOM->loadHTML($html);\r
-    $headers = $DOM->getElementsByTagName('h2');\r
-    $divs = $DOM->getElementsByTagName('div');\r
-    \r
-    $recentEpDT;\r
-    $episodePages = [];\r
-    \r
-    foreach ($headers as $header) {\r
-        if ($header->hasAttribute('class') && $header->getAttribute('class') === 'river--hed') {\r
-            $recentEpDT = DateTime::createFromFormat(DATE_FORM, explode(':', $header->nodeValue)[0]);\r
-            break;\r
-        }\r
-    }\r
-    \r
-    $prevDate = (int) $prevDT->format('d');\r
-    $recentEpDate = (int) $recentEpDT->format('d');\r
-    $daysToGet = ($prevDate === 1) ? $recentEpDate : $recentEpDate - $prevDate;\r
-    $daysToGet = $daysToGet - 2 * (int) ($daysToGet / 7);\r
-    \r
-    if ($daysToGet === 0) {\r
-        echo 'No new episodes since last check.';\r
-        exit(0);\r
-    }\r
-\r
-    \r
-    do {\r
-        $episodePages[] = parseEpisodePage($divs, $daysToGet);\r
-    } while ($daysToGet > 0 && ($DOM->loadHTML(file_get_contents('https://www.marketplace.org/latest-music?page=' . ++$page))) && ($divs = $DOM->getElementsByTagName('div')) );\r
-\r
-    /*\r
-    echo '<br />';\r
-\r
-    print_r($date_headers);\r
-    \r
-    echo '<br />';\r
-    print_r($episodes);\r
-    */\r
-    \r
-    print_r($episodePages);\r
-    \r
     # Check if this month's playlist exists\r
     \r
-    $playlistName = MONTHS[$today->format('m')] . ' Marketplace Tracks';\r
+    $playlistName = MONTHS[$state[0]] . ' ' . $state[1] . ' Marketplace Tracks';\r
     \r
     $checkPlaylistOpts = [\r
         'http' => [\r
 \r
         $playlist_data = [\r
             'name' => $playlistName,\r
+            'description' => 'A playlist of Marketplace tracks by Marketplaylister.',\r
         ];\r
         \r
         $playlist_opts = [\r
     \r
     $uris = [];\r
     \r
-    foreach ( array_reverse($episodePages) as $episodes) {\r
-        foreach ( array_reverse($episodes) as $episode) {\r
-\r
-        $track_opts = [\r
-            'http' => [\r
-                'method' => 'GET',\r
-                'header' => 'Authorization: Bearer ' . $spot_token . "\r\n"\r
-            ]\r
-        ];\r
-    \r
-        $track_context = stream_context_create($track_opts);\r
-    \r
-            foreach ($episode as $song_info) {\r
-    \r
-                $track_search_url = BASE_URL . 'search?q=track:' . urlencode($song_info['title']) \r
-                                    . '+artist:' . urlencode($song_info['artist']) . '&type=track';\r
-             \r
-                #echo '<br />' . $track_search_url;\r
-                #echo '<br />';\r
-            \r
-                $trackReq = file_get_contents($track_search_url, false, $track_context);\r
-                if ($trackReq) {\r
-                    $trackJSON = json_decode($trackReq, true);\r
-                    $trackJSON = $trackJSON['tracks'];\r
-            \r
-                    print_r($trackJSON);\r
-                    \r
-                    if ($trackJSON['total'] === 0) {\r
-                        continue;\r
-                    }\r
-            \r
-                    $uris[] = $trackJSON['items'][0]['uri'];\r
-            \r
-                    #rate limit\r
-                    sleep(1);\r
-\r
-                }\r
-            }\r
-        }\r
+    // Collect the stored Spotify URIs for the month/year carried in the OAuth state.\r
+    $pdo = new PDO("sqlite:mktplc.sqlite3");\r
+    // The earlier WHERE clause opened "(SELECT" without closing it, so prepare() could not succeed.\r
+    // Rows whose uri is still NULL (not yet matched on Spotify) are skipped.\r
+    // NOTE(review): strftime('%m') yields a zero-padded month ('04'); assumes $state[0] uses the same form - TODO confirm.\r
+    $stmt = $pdo->prepare("SELECT uri FROM songs s WHERE strftime('%m', s.date) = :month AND strftime('%Y', s.date) = :year AND s.uri IS NOT NULL");\r
+    $stmt->bindParam(':month', $state[0]);\r
+    $stmt->bindParam(':year', $state[1]);\r
+    if ($stmt->execute()) {\r
+      while ($row = $stmt->fetch()) {\r
+        $uris[] = $row['uri'];\r
+      }\r
     }\r
         \r
     $update_data = [\r
index 361e2ae..b456ec8 100644 (file)
@@ -1,6 +1,6 @@
 <?php
     require 'secrets.php';
     $scopes = 'playlist-modify-private playlist-modify-public';
-    $url = 'https://accounts.spotify.com/en/authorize?response_type=code&client_id=868e2cba00de4819900dd8a647a7ba7d&scope=' . urlencode($scopes) . '&redirect_uri=' . urlencode(REDIRECT_URI) ;
+    $url = 'https://accounts.spotify.com/en/authorize?response_type=code&client_id=868e2cba00de4819900dd8a647a7ba7d&scope=' . urlencode($scopes) . '&redirect_uri=' . urlencode(REDIRECT_URI) . '&state=' . urlencode($_GET['state'] ?? ''); // state is user input: encode it so it cannot inject extra query parameters
     header('Location: ' .$url);
     exit();
diff --git a/marketscraper.php b/marketscraper.php
new file mode 100644 (file)
index 0000000..ff55a5e
--- /dev/null
@@ -0,0 +1,76 @@
+<?php
+
+    require 'mpfuncs.php';
+    
+    const SQLITE_DATE_FORM = 'Y-m-d H:i:s';
+    
+    $pdo = new PDO("sqlite:mktplc.sqlite3");
+        
+    $genDate = DateTime::createFromFormat(DATE_FORM, '1/1/2017');
+    
+    $page = 1;
+    
+    $query = $pdo->query('SELECT date FROM songs order by date desc limit 1'); // newest stored row only
+    $resultset = $query->fetch();
+    // Newest stored episode date; an empty table (fetch() returns false) falls back to $genDate.
+    $lastEpDT = $resultset ? DateTime::createFromFormat(SQLITE_DATE_FORM, $resultset['date']) : $genDate;
+    $startDate = new DateTime;
+    $episodeDatePages = [];
+    $episodeTrackPages = [];
+    
+    while ($startDate > $lastEpDT) {
+      // DOM garbles UTF-8 chars, so loading them to HTML-ENTITIES data fixes this
+      $html = mb_convert_encoding(file_get_contents('https://www.marketplace.org/latest-music?page=' . $page), 'HTML-ENTITIES', "UTF-8");
+      $DOM = new DOMDocument;
+      $DOM->loadHTML($html);
+      $headers = $DOM->getElementsByTagName('h2');
+      $divs = $DOM->getElementsByTagName('div');
+      
+      $episodeDatePages[] = parseEpisodeDate($headers, $lastEpDT);
+      $episodeTrackPages[] = parseEpisodePage($divs);
+      $startDate = end($episodeDatePages[$page - 1]);
+      $page++;
+    }
+    
+    //print_r($episodeDatePages);
+
+    
+    //Unroll episodeDatePages
+    $episodeDates = [];
+    foreach ($episodeDatePages as $episodeDatePage) {
+      foreach ($episodeDatePage as $episodeDate) {
+        $episodeDates[] = $episodeDate;
+      }
+    }
+    
+    
+    // Unroll episodeTrackPages
+    $episodeTrackLists = [];
+    foreach ($episodeTrackPages as $epTrackPage) {
+      foreach ($epTrackPage as $epTrackList) {
+        $episodeTrackLists[] = $epTrackList;
+      }
+    }
+    
+    $episodes = array_slice(
+                  array_map(
+                    null, $episodeDates, $episodeTrackLists), 0, min(
+                      count($episodeDates), count($episodeTrackLists)
+                    )
+                );
+    print_r($episodes);
+    
+    $stmt = $pdo->prepare("INSERT INTO songs (track, artist, date) VALUES (:track, :artist, :date)");
+    $stmt->bindParam(':track', $trackName);
+    $stmt->bindParam(':artist', $artist);
+    $stmt->bindParam(':date', $date);
+    foreach(array_reverse($episodes) as $episode) {
+      $date = $episode[0]->format(SQLITE_DATE_FORM);
+      foreach ($episode[1] as $track) {
+        $trackName = $track['title'];
+        $artist = $track['artist'];
+        $stmt->execute();
+      }
+    }
+    
+    
\ No newline at end of file
diff --git a/mktplc.sqlite3 b/mktplc.sqlite3
new file mode 100644 (file)
index 0000000..5daf12d
Binary files /dev/null and b/mktplc.sqlite3 differ
index a35c89f..051619a 100644 (file)
@@ -1,12 +1,12 @@
 <?php\r
+\r
+    const DATE_FORM = 'm/d/Y';\r
+\r
     \r
-    function parseEpisodePage(DomNodeList $divs, int &$numDays): array {\r
+    function parseEpisodePage(DomNodeList $divs): array {\r
         $episodePage = [];\r
         foreach ($divs as $div) {\r
             if ($div->hasAttribute('class') && $div->getAttribute('class') === 'episode-music') {\r
-                if (!$numDays) {\r
-                    break;\r
-                }\r
                 $songs = [];\r
                 foreach ($div->childNodes as $row) {\r
                     $children = $row->childNodes[0]->childNodes;\r
@@ -15,7 +15,6 @@
                         'artist' => $children[1]->nodeValue\r
                     ];\r
                 }\r
-                $numDays--;\r
                 $episodePage[] = $songs;\r
             }\r
             \r
         \r
         return $episodePage;\r
         \r
+    }\r
+    \r
+    /**\r
+     * Go through the DOM elements provided and pull out the Dates of all marketplace\r
+     * pod episodes in the provided list.\r
+     * \r
+     * Only headers whose class attribute is exactly 'river--hed' are read, and scanning\r
+     * stops at the first episode that compares earlier than $lastDate.\r
+     * \r
+     * @param DomNodeList $headers The elements with a header tag from the DOM\r
+     * @param DateTime $lastDate The date of the most recent episode from the DB\r
+     * @return array The parsed episode dates, in the order the headers were visited\r
+     */\r
+    function parseEpisodeDate(DomNodeList $headers, DateTime $lastDate): array {\r
+      $episodeDates = [];\r
+      foreach ($headers as $header) {\r
+        if ($header->hasAttribute('class') && $header->getAttribute('class') === 'river--hed') {\r
+          // Take the header text before the first ':' and split it on '/' into date parts.\r
+          $dateStringParts = explode('/', explode(':', $header->nodeValue)[0]);\r
+          // A two-character year is expanded by prefixing '20'.\r
+          if ( strlen($dateStringParts[2]) === 2 ) {\r
+            $dateStringParts[2] = '20' . $dateStringParts[2];\r
+          }\r
+          // NOTE(review): DATE_FORM is defined outside this function; if it has no time fields the parsed\r
+          // value may pick up the current time of day - confirm that does not skew the comparison below.\r
+          $episodeDate = DateTime::createFromFormat(DATE_FORM, implode("/", $dateStringParts));\r
+          if ($episodeDate < $lastDate) {\r
+            break;\r
+          }\r
+          $episodeDates[] = $episodeDate;\r
+        }\r
+      }\r
+      return $episodeDates;\r
     }
\ No newline at end of file
diff --git a/searchify.php b/searchify.php
new file mode 100644 (file)
index 0000000..740f79f
--- /dev/null
@@ -0,0 +1,49 @@
+<?php
+
+    const BASE_URL = 'https://api.spotify.com/v1/';
+    // Currently updated manually whenever I get one from the server
+    const SPOT_TOKEN = 'BQCQTtwO2kiMcV_VDgfSmTXQzGlO47rUuPyc4oCHpRunPQx2ZhhVYOtksVZPMbSgoCy3cGiRMHMygon5-SleqfsP0lvRMQW3gm1Q_a8TRv5MfCGQdNwdUcUu_NBpcSjWDNUadWeg3ps-WTDWxjUWm_FOlfxMy7a2AdI_RHWZ0Lx56WHf8gYA4-YVUm_HxpqDlReqEkWE9DHppQ';
+  
+    $pdo = new PDO("sqlite:mktplc.sqlite3");
+    // Select every song that still lacks a Spotify URI and prepare the write-back statement.
+    $stmt = $pdo->prepare("SELECT * FROM SONGS WHERE uri IS NULL");
+    $upstmt = $pdo->prepare("UPDATE songs SET uri = :uri WHERE id = :id");
+    $upstmt->bindParam(':uri', $uri);
+    $upstmt->bindParam(':id', $id);
+    
+    if ($stmt->execute()) {
+      // A leftover debug dump (print_r of fetchAll() followed by exit(0)) used to sit here; it drained
+      // the result set and ended the script, so the lookup loop below could never run.
+      // bindParam() binds by reference: $uri and $id are read each time $upstmt executes.
+      while ($row = $stmt->fetch()) {
+        // Build the HTTP context carrying the bearer token for the search request.
+        $track_opts = [
+            'http' => [
+                'method' => 'GET',
+                'header' => 'Authorization: Bearer ' . SPOT_TOKEN . "\r\n"
+            ]
+        ];
+        
+        $track_context = stream_context_create($track_opts);
+        
+        $track_search_url = BASE_URL . 'search?q=track:' . urlencode($row['track']) 
+                            . '+artist:' . urlencode($row['artist']) . '&type=track';
+        // A failed request yields false and the row is left for a later run.
+        $trackReq = file_get_contents($track_search_url, false, $track_context);
+        if ($trackReq) {
+          $trackJSON = json_decode($trackReq, true);
+          $trackJSON = $trackJSON['tracks'] ?? null;
+          if (empty($trackJSON['items'])) { // no match, or a payload without a 'tracks' list
+            continue;
+          }
+
+          $uri = $trackJSON['items'][0]['uri'];
+          $id = $row['id'];
+          $upstmt->execute();
+
+          #rate limit
+          sleep(1);
+
+        }
+      }
+    }
\ No newline at end of file