|
| 1 | +import requests |
| 2 | +from bs4 import BeautifulSoup |
| 3 | +import csv |
| 4 | +import secrets |
| 5 | +import urllib |
| 6 | +import re |
| 7 | + |
# Search the WorldCat Search API by title (plus publication date when one can
# be extracted) for every row of 'oclcRecordsTitle.csv'. Each search hit's
# full record is then fetched; hits that pass the filter are written to
# 'oclcTitleDateSearchMatches.csv', and rows whose search returned no items
# are written to 'oclcTitleDateSearchNonMatches.csv'.
#
# The script targets Python 2: it relies on urllib.quote, on csv files opened
# in 'wb' mode, and on byte-string .encode('utf-8') values.
#
# NOTE(review): block nesting was reconstructed from a listing that carried no
# indentation. Two placements are assumptions to confirm: (1) the non-match
# row is written when the search returns no items at all, and (2) the blank
# spacer row is written once per input row, after all of its hits.
#
# NOTE(review): subfield markers are matched as the bare letters 'b' and 'c'
# and the text is cut two characters after the marker. That offset suggests a
# two-character marker (delimiter + letter) in the input data - confirm
# against a sample of 'oclcRecordsTitle.csv' before trusting the extraction.

baseURL = 'http://www.worldcat.org/webservices/catalog/search/opensearch?q='
baseURL2 = 'http://www.worldcat.org/webservices/catalog/content/'

MATCH_HEADER = ['bibNumber', 'searchTitle', 'searchDate', 'searchType',
                'oclcTitle', 'date', 'oclcNum', 'url', 'author', 'publisher',
                'physDesc', 'encoding']
NON_MATCH_HEADER = ['bibNumber', 'searchTitle', 'searchDate']


def extract_search_date(row):
    """Return the search date for an input row, or '' when there is none.

    The '260 - all subfields' column is used when it is non-empty; the
    '264 - all subfields' column is consulted only when 260 is empty or
    absent. Within the chosen column, everything from two characters after
    the first 'c' is kept and then reduced to digits and hyphens.
    """
    for column in ('260 - all subfields', '264 - all subfields'):
        field = row.get(column) or ''
        if field == '':
            continue
        if 'c' not in field:
            # A populated field without a date marker yields no date; the
            # other column is deliberately not consulted in that case.
            return ''
        rawDate = field[field.index('c') + 2:].strip()
        return re.sub(r'[^\d-]+', '', rawDate)
    return ''


def clean_search_title(title):
    """Return the title with the 'b' marker blanked and the 'c' part removed.

    The first 'b' and the character after it are replaced by one space, then
    the text is truncated at the first remaining 'c'.
    """
    if 'b' in title:
        cut = title.index('b')
        title = title[:cut] + ' ' + title[cut + 2:]
    if 'c' in title:
        title = title[:title.index('c')]
    return title


def build_search_url(searchTitle, searchDate, wskey):
    """Return the OpenSearch request URL for a title and optional date.

    The year clause is left out when searchDate is empty, so that an empty
    quoted year is never sent to the service.
    """
    dateClause = ''
    if searchDate:
        dateClause = 'srw.yr+%3D+"' + searchDate + '"+and+'
    # Strip before quoting: once quoted, surrounding spaces have already
    # become '%20' and can no longer be stripped.
    searchTitleURL = urllib.quote(searchTitle.strip())
    return (baseURL + dateClause + 'srw.ti+%3D+"' + searchTitleURL
            + '"&format=rss&wskey=' + wskey)


def subfield_text(record, tag, code):
    """Return the UTF-8 text of the first tag/code subfield, or '' if absent."""
    try:
        datafield = record.find('datafield', {'tag': tag})
        return datafield.find('subfield', {'code': code}).text.encode('utf-8')
    except AttributeError:
        # find() returned None because the field or subfield is missing.
        return ''


def fetch_match_fields(record, wskey):
    """Fetch the full record for one search hit and return its CSV fields.

    record is one <item> element of the search response. Returns the list
    [oclcTitle, date, oclcNum, url, author, publisher, physDesc, encoding],
    or None when the full record is unavailable or is filtered out.
    """
    oclcTitle = record.find('title').text.encode('utf-8')
    url = record.find('guid').text.encode('utf-8')
    oclcNum = url.replace('http://worldcat.org/oclc/', '')
    try:
        author = record.find('author').find('name').text.encode('utf-8')
    except AttributeError:
        # Items without an author element should not abort the whole run.
        author = ''

    response2 = requests.get(
        baseURL2 + oclcNum
        + '?servicelevel=full&classificationScheme=LibraryOfCongress&wskey='
        + wskey).content
    record2 = BeautifulSoup(response2, 'lxml').find('record')
    if record2 is None:
        # The response contained no <record> element, so there is nothing
        # to describe for this hit.
        print('no full record returned for OCLC number ' + oclcNum)
        return None

    encoding = record2.find('leader').text[17].encode('utf-8')
    field008 = record2.find('controlfield', {'tag': '008'}).text
    # Character 23 of the 008 - presumably the MARC "form of item" code for
    # book-format records, blank meaning no special form; confirm.
    formCode = field008[23:24]
    date = field008[7:11].encode('utf-8')
    publisher = (subfield_text(record2, '260', 'b')
                 or subfield_text(record2, '264', 'b'))
    catLang = subfield_text(record2, '040', 'b')
    physDesc = subfield_text(record2, '300', 'a')

    if formCode != ' ' or catLang not in ('eng', ''):
        return None
    return [oclcTitle, date, oclcNum, url, author, publisher, physDesc,
            encoding]


def main():
    """Run the search for every input row and write both report files."""
    wskey = secrets.wskey
    with open('oclcTitleDateSearchMatches.csv', 'wb') as matchFile, \
            open('oclcTitleDateSearchNonMatches.csv', 'wb') as nonMatchFile, \
            open('oclcRecordsTitle.csv') as csvfile:
        f = csv.writer(matchFile)
        f.writerow(MATCH_HEADER)
        f2 = csv.writer(nonMatchFile)
        f2.writerow(NON_MATCH_HEADER)

        for row in csv.DictReader(csvfile):
            bibNumber = row['bib#']
            print(bibNumber)
            searchType = 'date & title'
            # Always defined, so a row without a date can no longer reuse
            # the previous row's value.
            searchDate = extract_search_date(row)
            # The first two characters of the 245 column are dropped before
            # cleaning.
            searchTitle = clean_search_title(row['245 - all subfields'][2:])

            query = build_search_url(searchTitle, searchDate, wskey)
            # NOTE(review): this echoes the API key to stdout.
            print(query)
            response = requests.get(query).content
            records = BeautifulSoup(response, 'lxml').findAll('item')
            if not records:
                f2.writerow([bibNumber, searchTitle, searchDate])
                continue

            for record in records:
                fields = fetch_match_fields(record, wskey)
                if fields is not None:
                    f.writerow([bibNumber, searchTitle, searchDate,
                                searchType] + fields)
            # Blank spacer row, as wide as the header, between input rows.
            f.writerow([''] * len(MATCH_HEADER))


if __name__ == '__main__':
    main()
0 commit comments