
python code to get the url list of all videos of a youtube channel
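Below is a Scrapy spider that fetches a channel's /videos page through a local Splash instance (so the JavaScript-rendered video grid is present in the HTML) and then keeps following YouTube's browse_ajax continuation links, the same requests the page fires on scroll, until no "load more" widget comes back.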

import scrapy
import re
import pickle
import json

class MySpider(scrapy.Spider):
    name = "allvideos"

    # Output files, opened once and shared by every request: allUrl.txt
    # collects bare video ids, fullUrl.txt also keeps the /watch?v=... paths.
    allUrlFile = open('allUrl.txt', 'a')
    fullUrl = open('fullUrl.txt', 'a')

    # Fetch pages through a local Splash instance so the JavaScript-rendered
    # video grid is present in the HTML.
    localHost = "http://localhost:8050/render.html?url="
    # youtubeUrl = "https://www.youtube.com/channel/UCv1Ybb65DkQmokXqfJn0Eig/channels"  # channel with only one ajax call
    # youtubeUrl = "https://www.youtube.com/user/khanacademy/videos"  # khan academy
    youtubeUrl = "https://www.youtube.com/channel/UCU0kWLAbhVGxXarmE3b8rHg/videos"  # khan hindi
    start_urls = [localHost + youtubeUrl]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse)

    def parse(self, response):
        self.log("this program just visited "+ response.url)


        # First hit: the Splash-rendered channel page (not an ajax call).
        if 'browse_ajax' not in response.url:
            body = response.body.decode("utf-8")
            # The token for the next "scroll" page is embedded in a JSON
            # blob in the page source.
            data = re.findall(r'"continuations":\[{"nextContinuationData"(.+?)\]', body, re.S)
            strData = data[0]  # keep as str: the re.search below needs str, not bytes

            # Dump the raw blob for debugging.
            with open("continuationToken.txt", 'wb') as f:
                pickle.dump(strData, f)

            pattern = r'continuation":"(.*)",'
            continuationToken = re.search(pattern, strData, re.MULTILINE).group(1)
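            # The blob matched above looks roughly like this (shape inferred
            # from the regexes used here; the exact fields change over time):
            #   "continuations":[{"nextContinuationData":{"continuation":"<token>", ...}}]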

            # Collect the /watch?v=... hrefs rendered in the main page body.
            # (.extract() already returns str in Python 3, so no re-encoding
            # is needed before the string handling below.)
            hrefsInMainBody = response.css('a.yt-simple-endpoint.inline-block.style-scope.ytd-thumbnail::attr(href)').extract()
            print(hrefsInMainBody)

            for item in hrefsInMainBody:
                # Strip the leading "/watch?v=" so only the video id remains.
                i = item.find('=')
                print(item + " and adding " + item[i+1:])
                self.allUrlFile.write("%s\n" % item[i+1:])
                self.fullUrl.write("%s\n" % item[i+1:])
                self.fullUrl.write("%s\n" % item)

            # Request the ajax endpoint the page would call on scroll,
            # to fetch the next batch of videos.
            scrollUrl = "https://www.youtube.com/browse_ajax?ctoken=" + continuationToken
            print("final scroll url \n" + scrollUrl)
            yield scrapy.Request(scrollUrl, callback=self.parse)
        else:  # ajax scroll call
            print("this is else ")
            jsonResponse = json.loads(response.text)
            loadMoreDatafromHtml = jsonResponse['load_more_widget_html']
            htmlInAjaxCall = jsonResponse['content_html']

            # Dump the returned HTML fragment for debugging.
            with open("contentHtml.txt", 'wb') as f:
                pickle.dump(htmlInAjaxCall, f)
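            # The ajax response is JSON; the two keys read above hold HTML
            # fragments, roughly of this shape (illustrative only, the exact
            # markup varies):
            #   {
            #     "content_html": "... <a href=\"/watch?v=...\"> ...",
            #     "load_more_widget_html": "... data-uix-load-more-href=\"/browse_ajax?...\" ..."
            #   }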


            # Extract the /watch?v=... hrefs from the ajax HTML fragment.
            # Each match includes the closing quote, which is sliced off below.
            hrefsInAjaxCallPattern = r'/watch\?v=.*?"'
            try:
                hrefsinAjaxCall = re.findall(hrefsInAjaxCallPattern, htmlInAjaxCall)
                hrefsinAjaxCall = list(set(hrefsinAjaxCall))  # remove duplicates
                for item in hrefsinAjaxCall:
                    i = item.find('=')
                    print("checking " + item)
                    self.allUrlFile.write("%s\n" % item[i + 1:-1])
                    self.fullUrl.write("%s\n" % item[:-1])
                    self.fullUrl.write("%s\n" % item[i + 1:-1])
            except Exception as e:
                print("got caught in exception")
                print(e)
                # Keep the offending HTML fragment for post-mortem debugging.
                with open("exception.txt", 'ab') as f:
                    pickle.dump(htmlInAjaxCall, f)
            print("ajax call urls")
            # print(hrefsinAjaxCall)            if loadMoreDatafromHtml:
                pattern = 'data-uix-load-more-href="(\/.+?)"' #getting next href for scroll
                continuationhref = re.search(pattern, loadMoreDatafromHtml).group(1)
                # print("new token "+ continuationhref)                scrollUrl = "https://www.youtube.com" + continuationhref
                finalScrollUrl = scrollUrl
                # print("final scroll url \n" + finalScrollUrl)                yield scrapy.Request(finalScrollUrl, callback=self.parse)
            else:
                print("empty load more data")

