The foundry from dragondave

Foundry .url: source url .domains: domains of interest .melt(): get bytes of url resource into raw_content .raw_content: bytes of url resoure .centrifuge_callback: function to delete lxml bits .centrifuge(): get relevant bits of HTML, return bytes .centrifuged: bytes of useful HTML .alloy

class Foundry(object): def init(self, url, centrifuge_callback): self.url = url self.domains = DOMAINS self.domains.append(urlparse(self.url).netloc) self.melt() # self.content assert type(self.raw_content) == bytes self.centrifuge(centrifuge_callback) assert type(self.centrifuged) == bytes

def melt(self):
    """
    Return the binary data found at `url`.
    Sets self.raw_content
    Other versions might need to use cookies for login
    Or might use ricecooker.utils.downloader.read
    Or load from disk

    This version is just a placeholder.

    Input: self.url
    Output: html bytes
    """
    # return a binary representation of the URL
    self.raw_content = requests.get(self.url).content


def centrifuge(self, callback=None):
    """
    Select only the relevant parts of the HTML.

    Tasks:
    * Ensure CSS / encoding correct (is inline)
    * Ensure URLs fully referenced (lxml_tools.absolve)

    Option: replace entirely with conversion to/from Markdown
    Option: Additional cleaning of the HTML to remove other junk we don't care about
            e.g. CSS, gubbins attributes, misc. span/div

    Input: html bytes
    Output: html bytes
    """

    assert callback, "callback is required"
    root = lxml.html.fromstring(self.raw_content)
    main = callback(root)
    new_root = lxml.html.fromstring('<html><head><meta charset="UTF-8"><link rel="stylesheet" type="text/css" href="styles.css" preserve=true></head><body></body></html>') # PRESERVE
    body, = new_root.xpath("//body")
    body.append(main)
    absolve(body, self.url)
    self.centrifuged = lxml.html.tostring(new_root)

def alloy(self):
    root = lxml.html.fromstring(self.centrifuged)
    handled = {} # {url:filename}
    for attr in URL_ATTRS:
        tags = root.xpath("//*[@{}]".format(attr))
        for tag in tags:
            attribute_value = tag.attrib[attr]
            # ignore certain tags
            if PRESERVE in tag.attrib:
                continue
            for starter in ["#", "mailto:", "javascript:"]:
                if attribute_value.startswith(starter):
                    continue

            # MODIFICATION: Tag is now absolute
            tag.attrib[attr] = urljoin(self.url, tag.attrib[attr])


            if "youtube.com/" in tag.attrib or "youtu.be/" in tag.attrib:
                filename = handle_youtube(tag)  # placeholder, do something with file
                continue

            # TODO: handle links which are indirectly resources with a callback here.

            if attribute_value in handled:
                # If already handled this URL, rewrite and don't download
                tag.attrib[attr] = handled[attribute_value]
                continue

            response = get_resource(attribute_value)
            if response is None:
                if tag.tag == "a":
                    globalise(tag) # give it a pretty globe if it's a link we're not crawling.
                continue # bail out either way.

            # We now have a resource (specifically: request response) we must save.
            content_type = response.headers['Content-Type'].split(";")[0].strip()
            extension = mimetypes.guess_extension(content_type) or "" # .mp3
            filename = hashlib.sha1(attribute_value.encode('utf-8')).hexdigest() + extension


    self.alloyed = lxml.html.tostring(root)
    return self.alloyed

def wiki_xpath(root): content, = root.xpath("//div[@id='content']") subtitle, = content.xpath(".//div[@id='siteSub']") subtitle.getparent().remove(subtitle) return content

def mathplanet_xpath(root): content, = root.xpath("//article[@id='article']") return content

if name == "main": f = Foundry("https://www.mathplanet.com/education/pre-algebra/graphing-and-functions/graphing-linear-inequalities", mathplanet_xpath)

print (f.alloy())

dragondave / foundry Goto Github PK

foundry's Introduction

foundry's People

Contributors

Watchers

Recommend Projects

React

Vue.js

Typescript

TensorFlow

Django

Laravel

D3

Recommend Topics

javascript

web

server

Machine learning

Visualization

Game

Recommend Org

Facebook

Microsoft

Google

Alibaba

D3

Tencent

Jobs