|
2 | 2 | from bs4 import BeautifulSoup |
3 | 3 | import csv |
4 | 4 | import secrets |
| 5 | +import urllib |
5 | 6 |
|
# Python 2 script: for every row of oclcRecordsTitle.csv, search WorldCat by
# title and write candidate records to oclcTitleSearchMatches.csv; titles whose
# search returns no items are written to oclcTitleSearchNonMatches.csv.

# WorldCat web-service endpoints: title search (RSS) and single-record content.
baseURL = 'http://www.worldcat.org/webservices/catalog/search/opensearch?q='
baseURL2 = 'http://www.worldcat.org/webservices/catalog/content/'
wskey = secrets.wskey

# NOTE(review): these markers are matched as bare letters, so any title that
# contains a "b" or "c" is cut at that letter. The "+ 2" skip below suggests a
# two-character subfield marker (delimiter + code) was intended -- confirm the
# delimiter used in the input export and restore it here.
subfieldBMarker = 'b'
subfieldCMarker = 'c'


def _buildSearchTitle(field245):
    """Turn the raw '245 - all subfields' value into the title to search for.

    Drops the first two characters, replaces the first ``subfieldBMarker``
    occurrence (and the character after it) with a single space, then
    truncates at the first ``subfieldCMarker`` occurrence, if any.
    """
    title = field245[2:]
    if subfieldBMarker in title:
        cut = title.index(subfieldBMarker)
        title = title[:cut] + ' ' + title[cut + 2:]
    if subfieldCMarker in title:
        title = title[:title.index(subfieldCMarker)]
    return title


def _subfieldText(marcRecord, tag, code):
    """Return the UTF-8 text of subfield ``code`` in datafield ``tag``.

    Returns '' when the datafield or the subfield is absent (``find`` yields
    None and the chained attribute access raises AttributeError).
    """
    try:
        datafield = marcRecord.find('datafield', {'tag': tag})
        return datafield.find('subfield', {'code': code}).text.encode('utf-8')
    except AttributeError:
        return ''


def _fetchFullRecord(oclcNum):
    """Request the full record for ``oclcNum`` and return its <record> element.

    Returns None when the response contains no <record> element.
    """
    response2 = requests.get(
        baseURL2 + oclcNum
        + '?servicelevel=full&classificationScheme=LibraryOfCongress&wskey='
        + wskey
    ).content
    return BeautifulSoup(response2, "lxml").find('record')


def main():
    """Run the title search for every input row and write both report files.

    Reads ``oclcRecordsTitle.csv`` (columns ``bib#`` and
    ``245 - all subfields``); writes ``oclcTitleSearchMatches.csv`` and
    ``oclcTitleSearchNonMatches.csv`` in the working directory.

    Raises:
        AttributeError: if a search item lacks title/guid/author/name, or a
            fetched record lacks a leader or an 008 controlfield.
    """
    # 'wb' is the mode the Python 2 csv module expects for output files.
    with open('oclcTitleSearchMatches.csv', 'wb') as matchFile, \
            open('oclcTitleSearchNonMatches.csv', 'wb') as nonMatchFile:
        f = csv.writer(matchFile)
        f.writerow(['bibNumber', 'searchTitle', 'oclcTitle', 'oclcNum', 'url',
                    'author', 'publisher', 'physDesc', 'encoding', 'date'])
        f2 = csv.writer(nonMatchFile)
        f2.writerow(['bibNumber', 'searchTitle'])

        with open('oclcRecordsTitle.csv') as csvfile:
            for row in csv.DictReader(csvfile):
                bibNumber = row['bib#']
                print(bibNumber)  # progress indicator
                searchTitle = _buildSearchTitle(row['245 - all subfields'])
                search = urllib.quote(searchTitle)
                response = requests.get(
                    baseURL + search.strip() + '&format=rss&wskey=' + wskey
                ).content
                records = BeautifulSoup(response, "lxml").findAll('item')

                # NOTE(review): block nesting was reconstructed from a listing
                # without indentation; the non-match row is assumed to belong
                # to "search returned no items" -- confirm against the author's
                # intent.
                if not records:
                    f2.writerow([bibNumber, searchTitle])
                    continue

                for record in records:
                    oclcTitle = record.find('title').text.encode('utf-8')
                    url = record.find('guid').text.encode('utf-8')
                    oclcNum = url.replace('http://worldcat.org/oclc/', '')
                    author = record.find('author').find('name').text.encode('utf-8')

                    # Fix: the content endpoint is addressed by the bare OCLC
                    # number, not by the full worldcat.org guid URL.
                    record2 = _fetchFullRecord(oclcNum)
                    encoding = record2.find('leader').text[17].encode('utf-8')
                    field008 = record2.find('controlfield', {'tag': '008'}).text
                    # presumably MARC 008 position 23 (form of item); a blank
                    # is required for the record to be reported -- confirm.
                    formCode = field008[23:24]
                    date = field008[7:11].encode('utf-8')

                    # Publisher name: field 260 first, falling back to 264.
                    publisher = (_subfieldText(record2, '260', 'b')
                                 or _subfieldText(record2, '264', 'b'))
                    catLang = _subfieldText(record2, '040', 'b')
                    physDesc = _subfieldText(record2, '300', 'a')

                    if formCode == ' ' and catLang in ('eng', ''):
                        f.writerow([bibNumber, searchTitle, oclcTitle, oclcNum,
                                    url, author, publisher, physDesc, encoding,
                                    date])

                # Blank row separates the candidates of one bib from the next.
                f.writerow([''] * 10)


if __name__ == '__main__':
    main()
0 commit comments