# searchengine.py

import urllib
from BeautifulSoup import *
from urlparse import urljoin


class crawler:
    """Skeleton of a page crawler/indexer.

    Every method below is a placeholder: none of them stores state or
    touches a database yet. They only fix the interface that a later
    implementation is expected to fill in.
    """

    def __init__(self, dbname):
        """Initialise the crawler with the name of the database.

        Placeholder: ``dbname`` is accepted but neither used nor stored.
        """

    def __del__(self):
        """Finaliser hook. Placeholder: does nothing."""

    def dbCommit(self):
        """Placeholder for committing pending work; does nothing, returns None."""

    def getEntryId(self, table, field, value, createnew=True):
        """Auxiliary lookup for an entry id, adding the entry if it is absent.

        Placeholder: all arguments are ignored and None is always returned.
        """

    def addToIndex(self, url, soup):
        """Index an individual page.

        Placeholder: only prints a progress line for ``url``; ``soup`` is
        unused. Returns None.
        """
        message = 'Indexing %s' % url
        print(message)

    def getTextOnly(self, soup):
        """Extract the text from an HTML page (no tags).

        Placeholder: ``soup`` is ignored and None is always returned.
        """

    def separateWords(self, text):
        """Split ``text`` into separate words.

        NOTE(review): the original note said "by any non-whitespace
        character"; presumably non-word characters were meant -- confirm
        when this is implemented.

        Placeholder: ``text`` is ignored and None is always returned.
        """

    def isIndexed(self, url):
        """Report whether ``url`` is already indexed.

        Placeholder: always answers False.
        """
        already_indexed = False
        return already_indexed

    def addLinkRef(self, urlFrom, urlTo, linkText):
        """Add a link between two pages.

        Placeholder: does nothing and returns None.
        """

    def crawl(self, pages, depth=2):
        """Breadth-first crawl from ``pages`` down to ``depth``, indexing as we go.

        Placeholder: does nothing and returns None.
        """

    def createIndexTables(self):
        """Create the database tables.

        Placeholder: does nothing and returns None.
        """