
Python code to get the URL list of all videos of a YouTube channel
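The spider below pairs Scrapy with a local Splash rendering service (the http://localhost:8050/render.html?url= endpoint) so the JavaScript-heavy channel page is rendered before parsing. It collects the video links from the first rendered page of the channel's /videos tab, then follows YouTube's browse_ajax continuation URLs, the equivalent of scrolling, until no "load more" data comes back. Bare video IDs go to allUrl.txt and full hrefs to fullUrl.txt. The selectors and regexes depend on YouTube's internal markup and JSON, which change over time, so they may need updating.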

import scrapy
import re
import pickle
import json

# output files: bare video ids go to allUrl.txt, full hrefs go to fullUrl.txt
allUrlFile = open('allUrl.txt', 'a')
fullUrl = open('fullUrl.txt', 'a')

class MySpider(scrapy.Spider):
    name = "allvideos"

    # Splash rendering endpoint; the channel page needs JavaScript to render
    localHost = "http://localhost:8050/render.html?url="
    # youtubeUrl = "https://www.youtube.com/channel/UCv1Ybb65DkQmokXqfJn0Eig/channels" # channel with only one ajax
    # youtubeUrl = "https://www.youtube.com/user/khanacademy/videos" # khan academy
    youtubeUrl = "https://www.youtube.com/channel/UCU0kWLAbhVGxXarmE3b8rHg/videos" # khan hindi
    start_urls = [localHost + youtubeUrl]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse)

    def parse(self, response):
        self.log("this program just visited " + response.url)

        if 'browse_ajax' not in response.url:
            # pull the "nextContinuationData" blob out of the rendered page source
            data = re.findall(r'"continuations":\[{"nextContinuationData"(.+?)\]', response.body.decode("utf-8"), re.S)
            strData = data[0]  # already a str in Python 3; no need to re-encode

            # save the raw blob for debugging
            filename = "continuationToken.txt"
            with open(filename, 'wb') as f:
                pickle.dump(strData, f)

            # non-greedy so the match stops at the token's closing quote
            pattern = r"continuation\":\"(.*?)\","
            continuationToken = re.search(pattern, strData, re.MULTILINE).group(1)

            # hrefs of the videos already present in the rendered page
            hrefsInMainBody = response.css('a.yt-simple-endpoint.inline-block.style-scope.ytd-thumbnail::attr(href)').extract()
            print(hrefsInMainBody)

            for item in hrefsInMainBody:
                # remove the leading /watch?v= to keep just the video id
                i = item.find('=')
                print(item + " and adding " + item[i+1:])
                allUrlFile.write("%s\n" % item[i+1:])
                fullUrl.write("%s\n" % item[i+1:])
                fullUrl.write("%s\n" % item)

            # follow the browse_ajax URL, the equivalent of scrolling, to fetch the remaining videos
            scrollUrl = "https://www.youtube.com/browse_ajax?ctoken=" + continuationToken
            finalScrollUrl = scrollUrl
            print("final scroll url \n" + finalScrollUrl)
            yield scrapy.Request(finalScrollUrl, callback=self.parse)
        else:  # ajax scroll call
            print("handling ajax scroll response")
            jsonResponse = json.loads(response.text)
            loadMoreDatafromHtml = jsonResponse['load_more_widget_html']
            htmlInAjaxCall = jsonResponse['content_html']

            filename = "conentHtml.txt"            with open(filename, 'wb') as f:
                pickle.dump((htmlInAjaxCall), f)


            # get hrefs of the current ajax call; each match keeps its closing quote
            hrefsInAjaxCallPattern = r"/watch\?v=.*?\""
            try:
                hrefsinAjaxCall = re.findall(hrefsInAjaxCallPattern, htmlInAjaxCall)
                hrefsinAjaxCall = list(set(hrefsinAjaxCall))  # removing duplicates using a set
                for item in hrefsinAjaxCall:
                    i = item.find('=')
                    print("checking "+ item)
                    allUrlFile.write("%s\n" % item[i + 1:-1])
                    fullUrl.write("%s\n" % item[:-1])       # full href, trailing quote stripped
                    fullUrl.write("%s\n" % item[i + 1:-1])  # bare video id


            except Exception as e:
                print("got caught in exception")
                print(e)
                filename = "exception.txt"                with open(filename, 'ab') as f:
                    pickle.dump((htmlInAjaxCall), f)
            print("ajax call urls")
            if loadMoreDatafromHtml:
                # extract the next href for the scroll-equivalent request
                pattern = r'data-uix-load-more-href="(\/.+?)"'
                continuationhref = re.search(pattern, loadMoreDatafromHtml).group(1)
                scrollUrl = "https://www.youtube.com" + continuationhref
                finalScrollUrl = scrollUrl
                yield scrapy.Request(finalScrollUrl, callback=self.parse)
            else:
                print("empty load more data")


