I am using Python 2.7.8. The code I am following comes from the book Programming Collective Intelligence (O'Reilly) by Toby Segaran, chapter 4. I don't know why this error occurs or whether the author made a mistake; I also tried to find pysqlite2 but could not. Any idea what the problem is?

ImportError: No module named pysqlite2

Here is the code:

import urllib2
import re
from BeautifulSoup import *
from urlparse import urljoin
from pysqlite2 import dbapi2 as sqlite
import nn
mynet=nn.searchnet('nn.db') 

# Create a list of words to ignore 
ignorewords={'the':1,'of':1,'to':1,'and':1,'a':1,'in':1,'is':1,'it':1} 


class crawler:
    # Initialize the crawler with the name of database
    def __init__(self,dbname):
        self.con=sqlite.connect(dbname)

    def __del__(self):
        self.con.close()

    def dbcommit(self):
        self.con.commit()

    # Auxiliary function for getting an entry id and adding
    # it if it's not present
    def getentryid(self,table,field,value,createnew=True):
        cur=self.con.execute(
            "select rowid from %s where %s='%s'" % (table,field,value))
        res=cur.fetchone()
        if res==None:
            cur=self.con.execute(
                "insert into %s (%s) values ('%s')" % (table,field,value))
            return cur.lastrowid
        else:
            return res[0]


    # Index an individual page
    def addtoindex(self,url,soup):
        if self.isindexed(url): return
        print 'Indexing '+url

        # Get the individual words
        text=self.gettextonly(soup)
        words=self.separatewords(text)

        # Get the URL id
        urlid=self.getentryid('urllist','url',url)

        # Link each word to this url
        for i in range(len(words)):
            word=words[i]
            if word in ignorewords: continue
            wordid=self.getentryid('wordlist','word',word)
            self.con.execute("insert into wordlocation(urlid,wordid,location) values (%d,%d,%d)" % (urlid,wordid,i))



    # Extract the text from an HTML page (no tags)
    def gettextonly(self,soup):
        v=soup.string
        if v==None:
            c=soup.contents
            resulttext=''
            for t in c:
                subtext=self.gettextonly(t)
                resulttext+=subtext+'\n'
            return resulttext
        else:
            return v.strip()

    # Separate the words by any non-word character
    def separatewords(self,text):
        splitter=re.compile('\\W*')
        return [s.lower() for s in splitter.split(text) if s!='']


    # Return true if this url is already indexed
    def isindexed(self,url):
        return False

    # Add a link between two pages
    def addlinkref(self,urlFrom,urlTo,linkText):
        words=self.separatewords(linkText)
        fromid=self.getentryid('urllist','url',urlFrom)
        toid=self.getentryid('urllist','url',urlTo)
        if fromid==toid: return
        cur=self.con.execute("insert into link(fromid,toid) values (%d,%d)" % (fromid,toid))
        linkid=cur.lastrowid
        for word in words:
            if word in ignorewords: continue
            wordid=self.getentryid('wordlist','word',word)
            self.con.execute("insert into linkwords(linkid,wordid) values (%d,%d)" % (linkid,wordid))

    # Starting with a list of pages, do a breadth
    # first search to the given depth, indexing pages
    # as we go
    def crawl(self,pages,depth=2):
        for i in range(depth):
            newpages={}
            for page in pages:
                try:
                    c=urllib2.urlopen(page)
                except:
                    print "Could not open %s" % page
                    continue
                try:
                    soup=BeautifulSoup(c.read())
                    self.addtoindex(page,soup)

                    links=soup('a')
                    for link in links:
                        if ('href' in dict(link.attrs)):
                            url=urljoin(page,link['href'])
                            if url.find("'")!=-1: continue
                            url=url.split('#')[0] # remove location portion
                            if url[0:4]=='http' and not self.isindexed(url):
                                newpages[url]=1
                            linkText=self.gettextonly(link)
                            self.addlinkref(page,url,linkText)

                    self.dbcommit()
                except:
                    print "Could not parse page %s" % page

            pages=newpages


    # Create the database tables
    def createindextables(self):
        self.con.execute('create table urllist(url)')
        self.con.execute('create table wordlist(word)')
        self.con.execute('create table wordlocation(urlid,wordid,location)')
        self.con.execute('create table link(fromid integer,toid integer)')
        self.con.execute('create table linkwords(wordid,linkid)')
        self.con.execute('create index wordidx on wordlist(word)')
        self.con.execute('create index urlidx on urllist(url)')
        self.con.execute('create index wordurlidx on wordlocation(wordid)')
        self.con.execute('create index urltoidx on link(toid)')
        self.con.execute('create index urlfromidx on link(fromid)')
        self.dbcommit()

    def calculatepagerank(self,iterations=20):
        # Clear out the current page rank tables
        self.con.execute('drop table if exists pagerank')
        self.con.execute('create table pagerank(urlid primary key,score)')

        # Initialize every url with a page rank of 1
        for (urlid,) in self.con.execute('select rowid from urllist'):
            self.con.execute('insert into pagerank(urlid,score) values (%d,1.0)' % urlid)
        self.dbcommit()

        for i in range(iterations):
            print "Iteration %d" % (i)
            for (urlid,) in self.con.execute('select rowid from urllist'):
                pr=0.15

                # Loop through all the pages that link to this one
                for (linker,) in self.con.execute(
                        'select distinct fromid from link where toid=%d' % urlid):
                    # Get the page rank of the linker
                    linkingpr=self.con.execute(
                        'select score from pagerank where urlid=%d' % linker).fetchone()[0]

                    # Get the total number of links from the linker
                    linkingcount=self.con.execute(
                        'select count(*) from link where fromid=%d' % linker).fetchone()[0]
                    pr+=0.85*(linkingpr/linkingcount)
                self.con.execute(
                    'update pagerank set score=%f where urlid=%d' % (pr,urlid))
            self.dbcommit()

class searcher:
    def __init__(self,dbname):
        self.con=sqlite.connect(dbname)

    def __del__(self):
        self.con.close()

    def getmatchrows(self,q):
        # Strings to build the query
        fieldlist='w0.urlid'
        tablelist=''
        clauselist=''
        wordids=[]

        # Split the words by spaces
        words=q.split(' ')
        tablenumber=0

        for word in words:
            # Get the word ID
            wordrow=self.con.execute(
                "select rowid from wordlist where word='%s'" % word).fetchone()
            if wordrow!=None:
                wordid=wordrow[0]
                wordids.append(wordid)
                if tablenumber>0:
                    tablelist+=','
                    clauselist+=' and '
                    clauselist+='w%d.urlid=w%d.urlid and ' % (tablenumber-1,tablenumber)
                fieldlist+=',w%d.location' % tablenumber
                tablelist+='wordlocation w%d' % tablenumber
                clauselist+='w%d.wordid=%d' % (tablenumber,wordid)
                tablenumber+=1

        # Create the query from the separate parts
        fullquery='select %s from %s where %s' % (fieldlist,tablelist,clauselist)
        print fullquery
        cur=self.con.execute(fullquery)
        rows=[row for row in cur]

        return rows,wordids

    def getscoredlist(self,rows,wordids):
        totalscores=dict([(row[0],0) for row in rows])

        # This is where we'll put our scoring functions
        weights=[(1.0,self.locationscore(rows)),
                 (1.0,self.frequencyscore(rows)),
                 (1.0,self.pagerankscore(rows)),
                 (1.0,self.linktextscore(rows,wordids)),
                 (5.0,self.nnscore(rows,wordids))]
        for (weight,scores) in weights:
            for url in totalscores:
                totalscores[url]+=weight*scores[url]

        return totalscores

    def geturlname(self,id):
        return self.con.execute(
            "select url from urllist where rowid=%d" % id).fetchone()[0]

    def query(self,q):
        rows,wordids=self.getmatchrows(q)
        scores=self.getscoredlist(rows,wordids)
        rankedscores=[(score,url) for (url,score) in scores.items()]
        rankedscores.sort()
        rankedscores.reverse()
        for (score,urlid) in rankedscores[0:10]:
            print '%f\t%s' % (score,self.geturlname(urlid))
        return wordids,[r[1] for r in rankedscores[0:10]]

    def normalizescores(self,scores,smallIsBetter=0):
        vsmall=0.00001 # Avoid division by zero errors
        if smallIsBetter:
            minscore=min(scores.values())
            return dict([(u,float(minscore)/max(vsmall,l)) for (u,l) in scores.items()])
        else:
            maxscore=max(scores.values())
            if maxscore==0: maxscore=vsmall
            return dict([(u,float(c)/maxscore) for (u,c) in scores.items()])

    def frequencyscore(self,rows):
        counts=dict([(row[0],0) for row in rows])
        for row in rows: counts[row[0]]+=1
        return self.normalizescores(counts)

    def locationscore(self,rows):
        locations=dict([(row[0],1000000) for row in rows])
        for row in rows:
            loc=sum(row[1:])
            if loc<locations[row[0]]: locations[row[0]]=loc

        return self.normalizescores(locations,smallIsBetter=1)

    def distancescore(self,rows):
        # If there's only one word, everyone wins!
        if len(rows[0])<=2: return dict([(row[0],1.0) for row in rows])

        # Initialize the dictionary with large values
        mindistance=dict([(row[0],1000000) for row in rows])

        for row in rows:
            dist=sum([abs(row[i]-row[i-1]) for i in range(2,len(row))])
            if dist<mindistance[row[0]]: mindistance[row[0]]=dist
        return self.normalizescores(mindistance,smallIsBetter=1)

    def inboundlinkscore(self,rows):
        uniqueurls=dict([(row[0],1) for row in rows])
        inboundcount=dict([(u,self.con.execute('select count(*) from link where toid=%d' % u).fetchone()[0]) for u in uniqueurls])
        return self.normalizescores(inboundcount)

    def linktextscore(self,rows,wordids):
        linkscores=dict([(row[0],0) for row in rows])
        for wordid in wordids:
            cur=self.con.execute('select link.fromid,link.toid from linkwords,link where wordid=%d and linkwords.linkid=link.rowid' % wordid)
            for (fromid,toid) in cur:
                if toid in linkscores:
                    pr=self.con.execute('select score from pagerank where urlid=%d' % fromid).fetchone()[0]
                    linkscores[toid]+=pr
        maxscore=max(linkscores.values())
        normalizedscores=dict([(u,float(l)/maxscore) for (u,l) in linkscores.items()])
        return normalizedscores

    def pagerankscore(self,rows):
        pageranks=dict([(row[0],self.con.execute('select score from pagerank where urlid=%d' % row[0]).fetchone()[0]) for row in rows])
        maxrank=max(pageranks.values())
        normalizedscores=dict([(u,float(l)/maxrank) for (u,l) in pageranks.items()])
        return normalizedscores

    def nnscore(self,rows,wordids):
        # Get unique URL IDs as an ordered list
        urlids=[urlid for urlid in dict([(row[0],1) for row in rows])]
        nnres=mynet.getresult(wordids,urlids)
        scores=dict([(urlids[i],nnres[i]) for i in range(len(urlids))])
        return self.normalizescores(scores)

Answer


This book is eight years old and targets Python 2.4. You skipped the step that installs the pysqlite2 module, but for Python 2.5 and later you don't need it: just do import sqlite3 as sqlite.
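
For what it's worth, here is a minimal sketch of a version-tolerant import (my suggestion, not code from the book): it uses the standard-library sqlite3 where available and only falls back to pysqlite2 on old interpreters.

try:
    import sqlite3 as sqlite              # standard library from Python 2.5 onward
except ImportError:
    from pysqlite2 import dbapi2 as sqlite  # only needed on Python 2.4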


Thanks, that gets past the error, but nothing happens the way the book describes: it is supposed to print scores and links. Since, as you said, the book is eight years old, what changes do I need to make to get the correct results? – user3162878
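
For reference, a rough driver in the spirit of the chapter's walkthrough, assuming the listing above is saved as searchengine.py; the seed URL, database file names and query string are placeholders, not from the original post:

import searchengine

pagelist=['http://example.com/']                 # placeholder seed URL
crawler=searchengine.crawler('searchindex.db')
crawler.createindextables()                      # first run only; the tables must exist
crawler.crawl(pagelist)
crawler.calculatepagerank()

e=searchengine.searcher('searchindex.db')
e.query('functional programming')                # prints score<TAB>url for the top matches

Note that nnscore goes through the nn module, so if nn.db has not been set up the query step may fail; temporarily dropping the nnscore entry from weights in getscoredlist is an easy way to test the rest of the pipeline.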


I also found the db at this link: https://code.google.com/p/programming-collective-intelligence/source/browse/trunk/src/?r=9. I thought it might produce some results, but no luck. – user3162878


Can you answer??? – user3162878