From: Jacob Casper Date: Sun, 17 Nov 2019 15:15:22 +0000 (-0600) Subject: v0.3.3: X-Git-Url: https://git.jacobcasper.com/?p=Marketplaylister.git;a=commitdiff_plain;h=86b8d9ed52d44c8d914906fe974c560df06b9e5b v0.3.3: Update scraper for site changes Fix formatting --- diff --git a/callback.php b/callback.php index b6211da..f47e73b 100644 --- a/callback.php +++ b/callback.php @@ -1,170 +1,169 @@ - 'January', - '02' => 'February', - '03' => 'March', - '04' => 'April', - '05' => 'May', - '06' => 'June', - '07' => 'July', - '08' => 'August', - '09' => 'September', - '10' => 'October', - '11' => 'November', - '12' => 'December', - ]; - - $code = $_GET['code']; - - // Get month and year user is requesting - $state = explode(':', $_GET['state']); - - - if (!$code) { - exit(1); - } - - #print_r($today); - - #Handle Spotify Token Authorization - - $token_data = [ - 'grant_type' => 'authorization_code', - 'code' => $code, - 'redirect_uri' => REDIRECT_URI - ]; - $token_data = http_build_query($token_data); - - $token_opts = [ - 'http' => [ - 'method' => 'POST', - /*'header' => "Content-type: application/x-www-form-urlencoded\r\n" - . "Content-Length: " . strlen($token_data) . "\r\n" - . "Authorization: Basic " . base64_encode('868e2cba00de4819900dd8a647a7ba7d:' . CLIENT_SECRET) . "\r\n",*/ - 'header' => "Authorization: Basic " . base64_encode('93a6f9c0375c45d4b348157691aa24e8:' . CLIENT_SECRET) . " \r\n", - 'content' => $token_data - ] - ]; - - $token_context = stream_context_create($token_opts); - - $spot_req = file_get_contents(AUTH_URL . 'api/token', false, $token_context); - - echo $spot_req; - $spot_json = json_decode($spot_req, true); - - $spot_token = $spot_json['access_token']; - - - $me_opts = [ - 'http' => [ - 'method' => 'GET', - 'header' => 'Authorization: Bearer ' . $spot_token . "\r\n" - ] - ]; - - $me_context = stream_context_create($me_opts); - - $me_resp = file_get_contents(BASE_URL . 'me', false, $me_context); - $me_json = json_decode($me_resp, true); - $me_id = $me_json['id']; - - # Check if this month's playlist exists - - $playlistName = MONTHS[$state[0]] . ' ' . $state[1] . ' Marketplace Tracks'; - - $checkPlaylistOpts = [ - 'http' => [ - 'method' => 'GET', - 'header' => 'Authorization: Bearer ' . $spot_token . "\r\n" - ] - ]; - - $checkPlaylistContext = stream_context_create($checkPlaylistOpts); - - $checkPlaylistReq = file_get_contents(BASE_URL . 'me/playlists', false, $checkPlaylistContext); - - $checkPlaylistJson = json_decode($checkPlaylistReq, true); - - foreach ($checkPlaylistJson['items'] as $playlist) { - #TODO should check if $user owns playlist - if (!strcmp($playlistName, $playlist['name'])) { - $playlistID = $playlist['id']; - } - } - - #echo 'playlistID' . $playlistID; - - # Create new playlist if one does not exist - # DEVELOPMENT TEMP ALWAYS CREATE NEW PLAYLIST - #if (!$playlistID) { - if (true) { - - $playlist_data = [ - 'name' => $playlistName, - 'description' => 'A playlist of Marketplace tracks by Marketplaylister.', - ]; - - $playlist_opts = [ - 'http' => [ - 'method' => 'POST', - 'header' => 'Authorization: Bearer ' . $spot_token . "\r\n" - . 'Content-Type application/json \r\n', - 'content' => json_encode($playlist_data) - ] - ]; - - $playlist_context = stream_context_create($playlist_opts); - $playlist_req = file_get_contents(BASE_URL . 'users/' . $me_id . '/playlists', false, $playlist_context); - $playlist_json = json_decode($playlist_req, true); - $playlistID = $playlist_json['id']; - - #echo '
' . $playlistID; - - } - - $uris = []; - - $pdo = new PDO("sqlite:mktplc.sqlite3"); - $stmt = $pdo->prepare("SELECT uri FROM songs s WHERE uri IS NOT NULL AND strftime('%m', s.date) == :month AND strftime('%Y', s.date) == :year"); - $stmt->bindParam(':month', $state[0]); - $stmt->bindParam(':year', $state[1]); - if ($stmt->execute()) { - while ($row = $stmt->fetch()) { - $uris[] = $row['uri']; - } - } - - $update_data = [ - 'uris' => $uris, - ]; - - echo '
update_data
'; - #print_r($update_data); - - $update_opts = [ - 'http' => [ - 'method' => 'POST', - 'header' => 'Authorization: Bearer ' . $spot_token . "\r\n" - . 'Content-Type application/json \r\n', - 'content' => json_encode($update_data) - ] - ]; - - $update_context = stream_context_create($update_opts); - $update_url = BASE_URL . 'users/' . $me_id . '/playlists/' . $playlistID . '/tracks'; - echo '
' . $update_url; - echo '
'; - echo '
' . count($uris); - echo '
'; - print_r(json_encode($update_data)); - $update_req = file_get_contents(BASE_URL . 'users/' . $me_id . '/playlists/' . $playlistID . '/tracks', false, $update_context); - print_r($update_req); \ No newline at end of file + 'January', + '02' => 'February', + '03' => 'March', + '04' => 'April', + '05' => 'May', + '06' => 'June', + '07' => 'July', + '08' => 'August', + '09' => 'September', + '10' => 'October', + '11' => 'November', + '12' => 'December', +]; + +$code = $_GET['code']; + +// Get month and year user is requesting +$state = explode(':', $_GET['state']); + + +if (!$code) { + exit(1); +} + +#print_r($today); + +#Handle Spotify Token Authorization + +$token_data = [ + 'grant_type' => 'authorization_code', + 'code' => $code, + 'redirect_uri' => REDIRECT_URI +]; +$token_data = http_build_query($token_data); + +$token_opts = [ + 'http' => [ + 'method' => 'POST', + /*'header' => "Content-type: application/x-www-form-urlencoded\r\n" + . "Content-Length: " . strlen($token_data) . "\r\n" + . "Authorization: Basic " . base64_encode('868e2cba00de4819900dd8a647a7ba7d:' . CLIENT_SECRET) . "\r\n",*/ + 'header' => "Authorization: Basic " . base64_encode('93a6f9c0375c45d4b348157691aa24e8:' . CLIENT_SECRET) . " \r\n", + 'content' => $token_data + ] +]; + +$token_context = stream_context_create($token_opts); + +$spot_req = file_get_contents(AUTH_URL . 'api/token', false, $token_context); + +echo $spot_req; +$spot_json = json_decode($spot_req, true); + +$spot_token = $spot_json['access_token']; + + +$me_opts = [ + 'http' => [ + 'method' => 'GET', + 'header' => 'Authorization: Bearer ' . $spot_token . "\r\n" + ] +]; + +$me_context = stream_context_create($me_opts); + +$me_resp = file_get_contents(BASE_URL . 'me', false, $me_context); +$me_json = json_decode($me_resp, true); +$me_id = $me_json['id']; + +# Check if this month's playlist exists + +$playlistName = MONTHS[$state[0]] . ' ' . $state[1] . ' Marketplace Tracks'; + +$checkPlaylistOpts = [ + 'http' => [ + 'method' => 'GET', + 'header' => 'Authorization: Bearer ' . $spot_token . "\r\n" + ] +]; + +$checkPlaylistContext = stream_context_create($checkPlaylistOpts); + +$checkPlaylistReq = file_get_contents(BASE_URL . 'me/playlists', false, $checkPlaylistContext); + +$checkPlaylistJson = json_decode($checkPlaylistReq, true); + +foreach ($checkPlaylistJson['items'] as $playlist) { + #TODO should check if $user owns playlist + if (!strcmp($playlistName, $playlist['name'])) { + $playlistID = $playlist['id']; + } +} + +# Create new playlist if one does not exist +# DEVELOPMENT TEMP ALWAYS CREATE NEW PLAYLIST +#if (!$playlistID) { + if (true) { + + $playlist_data = [ + 'name' => $playlistName, + 'description' => 'A playlist of Marketplace tracks by Marketplaylister.', + ]; + + $playlist_opts = [ + 'http' => [ + 'method' => 'POST', + 'header' => 'Authorization: Bearer ' . $spot_token . "\r\n" + . 'Content-Type: application/json \r\n', + 'content' => json_encode($playlist_data) + ] + ]; + + $playlist_context = stream_context_create($playlist_opts); + $playlist_req = file_get_contents(BASE_URL . 'users/' . $me_id . '/playlists', false, $playlist_context); + $playlist_json = json_decode($playlist_req, true); + $playlistID = $playlist_json['id']; + + + } + + $uris = []; + + $pdo = new PDO("sqlite:mktplc.sqlite3"); + $stmt = $pdo->prepare("SELECT uri FROM songs s WHERE uri IS NOT NULL AND strftime('%m', s.date) == :month AND strftime('%Y', s.date) == :year"); + $stmt->bindParam(':month', $state[0]); + $stmt->bindParam(':year', $state[1]); + if ($stmt->execute()) { + while ($row = $stmt->fetch()) { + $uris[] = $row['uri']; + } + } + + $update_data = [ + 'uris' => $uris, + ]; + + echo '
update_data
'; + + $update_opts = [ + 'http' => [ + 'method' => 'POST', + 'ignore_errors' => true, + 'header' => 'Authorization: Bearer ' . $spot_token . "\r\n" + . 'Content-Type: application/json \r\n' + . 'Accept: application/json \r\n', + 'content' => json_encode($update_data) + ] + ]; + + $update_context = stream_context_create($update_opts); + $update_url = BASE_URL . 'playlists/' . $playlistID . '/tracks'; + echo '
' . $update_url; + echo '
'; + echo '
' . count($uris); + echo '
'; + print_r(json_encode($update_data)); + $update_req = file_get_contents($update_url, false, $update_context); + var_dump($update_req); + diff --git a/marketplay.php b/marketplay.php index 6ff4b85..07e1de5 100644 --- a/marketplay.php +++ b/marketplay.php @@ -1,6 +1,7 @@ query('SELECT date FROM songs order by date desc limit 1'); - $resultset = $query->fetch(); +require 'mpfuncs.php'; - $lastEpDT = DateTime::createFromFormat(SQLITE_DATE_FORM, $resultset['date']); - $startDate = new DateTime; - $episodeDatePages = []; - $episodeTrackPages = []; - - while ($startDate > $lastEpDT) { - // DOM garbles UTF-8 chars, so loading them to HTML-ENTITIES data fixes this - $html = mb_convert_encoding(file_get_contents('https://www.marketplace.org/latest-music?page=' . $page), 'HTML-ENTITIES', "UTF-8"); - $DOM = new DOMDocument; - $DOM->loadHTML($html); - $headers = $DOM->getElementsByTagName('h2'); - $divs = $DOM->getElementsByTagName('div'); - - $episodeDatePages[] = parseEpisodeDate($headers, $lastEpDT); - $episodeTrackPages[] = parseEpisodePage($divs); - $startDate = end($episodeDatePages[$page - 1]); - $page++; - } - - //print_r($episodeDatePages); +set_time_limit(0); - - //Unroll episodeDatePages - $episodeDates = []; - foreach ($episodeDatePages as $episodeDatePage) { - foreach ($episodeDatePage as $episodeDate) { - $episodeDates[] = $episodeDate; - } - } - - - // Unroll episodeTrackPages - $episodeTrackLists = []; - foreach ($episodeTrackPages as $epTrackPage) { - foreach ($epTrackPage as $epTrackList) { - $episodeTrackLists[] = $epTrackList; - } +const SQLITE_DATE_FORM = 'Y-m-d H:i:s'; + +$pdo = new PDO("sqlite:mktplc.sqlite3"); + +$genDate = DateTime::createFromFormat(DATE_FORM, '1/1/2017'); + +$page = 1; + +$query = $pdo->query('SELECT date FROM songs order by date desc limit 1'); +$resultset = $query->fetch(); + +$lastEpDT = DateTime::createFromFormat(SQLITE_DATE_FORM, $resultset['date']); +$startDate = new DateTime; +$episodeDatePages = []; +$episodeTrackPages = []; + +$episodes = []; +while ($startDate > $lastEpDT) { + // DOM garbles UTF-8 chars, so loading them to HTML-ENTITIES data fixes this + $html = mb_convert_encoding(file_get_contents('https://www.marketplace.org/latest-music/marketplace/page/' . $page), 'HTML-ENTITIES', "UTF-8"); + $DOM = new DOMDocument; + $DOM->loadHTML($html); + $xpath = new DOMXPath($DOM); + $episodeData = $xpath->evaluate("//div[contains(@class, 'mp-music-card')]"); + foreach($episodeData as $episode) { + $children = iterator_to_array($episode->childNodes); + $episodeHeadCard = array_pop(findChildWithClass($children, 'mp-music-card-episode')); + $episodeMeta = array_pop(findChildWithClass($episodeHeadCard->childNodes, 'mp-music-card-meta')); + $episodeDate = array_pop(findChildWithClass($episodeMeta->childNodes, 'mp-music-card-meta_pubdate'))->textContent; + if (!isset($episodeDate)) { continue; } + $trackDiv = array_pop(findChildWithClass($children, 'mp-music-card-tracks')); + $trackItems = findChildWithClass($trackDiv->childNodes, 'flex w-full flex-wrap item'); + $trackIDs = []; + foreach($trackItems as $trackItem) { + $divs = findChildWithClass($trackItem->childNodes, 'w-full min-tablet:w-1/2'); + foreach ($divs as $div) { + $trackIDs[] = array_pop(explode('/', array_pop(findChildWithClass($div->childNodes, 'song-title'))->attributes->getNamedItem('href')->value)); + } + } + $episodes[$episodeDate] = $trackIDs; } - - $episodes = array_slice( - array_map( - null, $episodeDates, $episodeTrackLists), 0, min( - count($episodeDates), count($episodeTrackLists) - ) - ); - print_r($episodes); - - $stmt = $pdo->prepare("INSERT INTO songs (track, artist, date) VALUES (:track, :artist, :date)"); - $stmt->bindParam(':track', $trackName); - $stmt->bindParam(':artist', $artist); - $stmt->bindParam(':date', $date); - foreach(array_reverse($episodes) as $episode) { - $date = $episode[0]->format(SQLITE_DATE_FORM); - foreach ($episode[1] as $track) { - $trackName = $track['title']; - $artist = $track['artist']; + $startDate = new DateTime(end(array_keys($episodes))); + $page++; +} + +$stmt = $pdo->prepare("INSERT INTO songs (date, uri) VALUES (:date, :uri)"); +$stmt->bindParam(':date', $date); +$stmt->bindParam(':uri', $uri); +foreach(array_reverse($episodes) as $airDate => $trackIDs) { + $date = (new DateTime($airDate))->format(SQLITE_DATE_FORM); + foreach ($trackIDs as $trackID) { + $uri = "spotify:track:{$trackID}"; $stmt->execute(); - } } - - \ No newline at end of file +} diff --git a/mpfuncs.php b/mpfuncs.php index 870acaf..d5be144 100644 --- a/mpfuncs.php +++ b/mpfuncs.php @@ -1,48 +1,16 @@ -hasAttribute('class') && $div->getAttribute('class') === 'episode-music') { - $songs = []; - foreach ($div->childNodes as $row) { - $children = $row->childNodes[0]->childNodes; - $songs[] = [ - 'title' => $children[0]->nodeValue, - 'artist' => $children[1]->nodeValue - ]; - } - $episodePage[] = $songs; - } - - } - - return $episodePage; - - } - - /** - * Go through the DOM elements provided and pull out the Dates of all marketplace - * pod episodes in the provided list. - * - * @param DomNodeList $headers The elements with a header tag from the DOM - * @param DateTime $lastDate The date of the most recent episode from the DB - */ - function parseEpisodeDate(DomNodeList $headers, DateTime $lastDate): array { - $episodeDates = []; - foreach ($headers as $header) { - if ($header->hasAttribute('class') && $header->getAttribute('class') === 'river--hed') { - $episodeAnchorHref = $header->firstChild->getAttribute('href'); - $dateString = explode('/', $episodeAnchorHref)[3]; - $episodeDate = DateTime::createFromFormat(DATE_FORM, $dateString); - if ($episodeDate < $lastDate) { - break; - } - $episodeDates[] = $episodeDate; - } - } - return $episodeDates; - } +hasAttributes() + && $child->attributes->getNamedItem('class')->value == $class; + } + ); +} diff --git a/searchify.php b/searchify.php index 949abf9..a6391c0 100644 --- a/searchify.php +++ b/searchify.php @@ -1,48 +1,45 @@ prepare("SELECT * FROM SONGS WHERE uri IS NULL"); - $upstmt = $pdo->prepare("UPDATE songs SET (uri) = :uri WHERE id = :id"); - $upstmt->bindParam(':uri', $uri); - $upstmt->bindParam(':id', $id); - - if ($stmt->execute()) { - - #print_r($stmt->fetchAll()); - while ($row = $stmt->fetch()) { - +const BASE_URL = 'https://api.spotify.com/v1/'; +// Currently updated manually whenever I get one from the server +const SPOT_TOKEN = 'BQBU1Qs3ROpkN9CwlQNpZS00khdSU61zuejyKbjS4KiIszK8aiLaTd9TfPiSH0OsmtWStOVL7ym-QYEBWyLX3qlFIN5peit0n6_B-LLtz4C8KSh3Dxj5O3jf4HSWf3fFISC4cLbznfSV3QnpQ4vdnCTehz4vT8V54XDiG2hX275Uw_gDHzKjqFWQo249-rY42rBv7pf555wQ2PSBymuZMcDlIDEeAbGiyRI'; + +$pdo = new PDO("sqlite:mktplc.sqlite3"); + +$stmt = $pdo->prepare("SELECT * FROM SONGS WHERE uri IS NULL"); +$upstmt = $pdo->prepare("UPDATE songs SET (uri) = :uri WHERE id = :id"); +$upstmt->bindParam(':uri', $uri); +$upstmt->bindParam(':id', $id); + +if ($stmt->execute()) { + while ($row = $stmt->fetch()) { + $track_opts = [ 'http' => [ 'method' => 'GET', - 'header' => 'Authorization: Bearer ' . SPOT_TOKEN . "\r\n" + 'header' => 'Authorization: Bearer ' . SPOT_TOKEN . "\r\n" ] ]; - + $track_context = stream_context_create($track_opts); - + $track_search_url = BASE_URL . 'search?q=track:' . urlencode($row['track']) - . '+artist:' . urlencode($row['artist']) . '&type=track'; + . '+artist:' . urlencode($row['artist']) . '&type=track'; $trackReq = file_get_contents($track_search_url, false, $track_context); if ($trackReq) { - $trackJSON = json_decode($trackReq, true); - $trackJSON = $trackJSON['tracks']; - if ($trackJSON['total'] === 0) { - continue; - } + $trackJSON = json_decode($trackReq, true); + $trackJSON = $trackJSON['tracks']; + if ($trackJSON['total'] === 0) { + continue; + } - $uri = $trackJSON['items'][0]['uri']; - $id = $row['id']; - $upstmt->execute(); - - #rate limit - sleep(1); + $uri = $trackJSON['items'][0]['uri']; + $id = $row['id']; + $upstmt->execute(); + sleep(1); } - } - } \ No newline at end of file + } +} +