Skip to content

Commit 93df58e

Browse files
author
ehanson8
committed
updates
1 parent 7b34363 commit 93df58e

File tree

3 files changed

+89
-2
lines changed

3 files changed

+89
-2
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,4 +50,5 @@ secrets.py
5050
data/*
5151
!data/.keep
5252
.profile
53-
*.csv
53+
*.csv
54+
*.xlsx

oclcTitleDateSearch.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
import requests
2+
from bs4 import BeautifulSoup
3+
import csv
4+
import secrets
5+
import urllib
6+
import re
7+
8+
# Search the WorldCat OpenSearch endpoint by title (plus publication date when
# the input row carries one) for every row of oclcRecordsTitle.csv, then fetch
# the full record for each hit and write it to the matches CSV.  Rows whose
# search returns no items at all are written to the non-matches CSV.
#
# This script targets Python 2 (urllib.quote, csv files opened in 'wb' mode,
# byte strings written to csv).
baseURL = 'http://www.worldcat.org/webservices/catalog/search/opensearch?q='
baseURL2 = 'http://www.worldcat.org/webservices/catalog/content/'
wskey = secrets.wskey


def subfieldText(marcRecord, tag, code):
    """Return the UTF-8 text of the first `code` subfield of datafield `tag`.

    Returns None when the datafield or the subfield is absent (BeautifulSoup's
    find() returns None, so the chained lookup raises AttributeError).
    """
    try:
        datafield = marcRecord.find('datafield', {'tag': tag})
        return datafield.find('subfield', {'code': code}).text.encode('utf-8')
    except AttributeError:
        return None


def stripTitleSubfields(title):
    """Drop the 'b' marker (keeping the text after it) and cut at 'c'.

    NOTE(review): these tests match the bare letters 'b' and 'c' anywhere in
    the title, and the +2 offset skips the letter plus one more character.
    Presumably a subfield delimiter is intended -- confirm against the actual
    format of the '245 - all subfields' column.
    """
    if 'b' in title:
        cut = title.index('b')
        title = title[:cut] + ' ' + title[cut+2:]
    if 'c' in title:
        title = title[:title.index('c')]
    return title


with open('oclcTitleDateSearchMatches.csv', 'wb') as matchFile, \
        open('oclcTitleDateSearchNonMatches.csv', 'wb') as nonMatchFile, \
        open('oclcRecordsTitle.csv') as csvfile:
    f = csv.writer(matchFile)
    f.writerow(['bibNumber', 'searchTitle', 'searchDate', 'searchType',
                'oclcTitle', 'date', 'oclcNum', 'url', 'author', 'publisher',
                'physDesc', 'encoding'])
    f2 = csv.writer(nonMatchFile)
    f2.writerow(['bibNumber', 'searchTitle', 'searchDate'])

    reader = csv.DictReader(csvfile)
    for row in reader:
        bibNumber = row['bib#']
        print(bibNumber)
        searchType = 'date & title'
        # The first two characters of the 245 column are skipped before the
        # remaining subfield markers are handled.
        searchTitle = stripTitleSubfields(row['245 - all subfields'][2:])

        # Reset per row so a row without a date neither raises NameError nor
        # reuses the previous row's date in the output.
        searchDate = ''
        dateClause = ''
        # Prefer the 260 column; fall back to 264 only when 260 is empty.
        # (The original re-tested 260 in the elif, so 264 was never used.)
        dateField = row['260 - all subfields'] or row['264 - all subfields']
        if dateField and 'c' in dateField:
            # NOTE(review): same bare-letter caveat as stripTitleSubfields.
            rawDate = dateField[dateField.index('c')+2:].strip()
            # Keep only digits and hyphens.
            searchDate = re.sub(r'[^\d-]+', '', rawDate)
            dateClause = 'srw.yr+%3D+"'+searchDate+'"+and+'

        searchTitleURL = urllib.quote(searchTitle).strip()
        query = baseURL + dateClause + 'srw.ti+%3D+"' + searchTitleURL + '"&format=rss&wskey=' + wskey
        print(query)
        response = requests.get(query).content
        records = BeautifulSoup(response, 'lxml').findAll('item')

        # NOTE(review): indentation was lost in the source this was recovered
        # from.  The trailing else is taken to pair with this test (rows with
        # no search hits go to the non-matches file) and the blank separator
        # row is written once per input row after its hits -- confirm.
        if records:
            for record in records:
                oclcTitle = record.find('title').text.encode('utf-8')
                url = record.find('guid').text.encode('utf-8')
                oclcNum = url.replace('http://worldcat.org/oclc/', '')
                try:
                    author = record.find('author').find('name').text.encode('utf-8')
                except AttributeError:
                    # Item has no <author>/<name>; leave the column empty
                    # instead of aborting the whole run.
                    author = ''

                response2 = requests.get(baseURL2+oclcNum+'?servicelevel=full&classificationScheme=LibraryOfCongress&wskey='+wskey).content
                record2 = BeautifulSoup(response2, "lxml").find('record')
                if record2 is None:
                    # The content request returned no <record> element.
                    print('no record returned for ' + oclcNum)
                    continue

                encoding = record2.find('leader').text[17].encode('utf-8')
                field008 = record2.find('controlfield', {'tag': '008'}).text
                typeCode = field008[23:24]
                date = field008[7:11].encode('utf-8')

                # Publisher comes from 260$b, falling back to 264$b.
                publisher = subfieldText(record2, '260', 'b')
                if publisher is None:
                    publisher = subfieldText(record2, '264', 'b')
                if publisher is None:
                    publisher = ''
                catLang = subfieldText(record2, '040', 'b') or ''
                physDesc = subfieldText(record2, '300', 'a') or ''

                # Keep only records whose 008 position 23 is blank and whose
                # 040$b is 'eng' or absent.
                if typeCode == ' ' and catLang in ('eng', ''):
                    f.writerow([bibNumber, searchTitle, searchDate, searchType,
                                oclcTitle, date, oclcNum, url, author,
                                publisher, physDesc, encoding])
            f.writerow([''] * 10)
        else:
            f2.writerow([bibNumber, searchTitle, searchDate])

oclcTitlePhraseEnhanced.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
url = record.find('guid').text.encode('utf-8')
3838
oclcNum = url.replace('http://worldcat.org/oclc/','')
3939
author = record.find('author').find('name').text.encode('utf-8')
40-
response2 = requests.get(baseURL2+url+'?servicelevel=full&classificationScheme=LibraryOfCongress&wskey='+wskey).content
40+
response2 = requests.get(baseURL2+oclcNum+'?servicelevel=full&classificationScheme=LibraryOfCongress&wskey='+wskey).content
4141
record2 = BeautifulSoup(response2, "lxml").find('record')
4242
encoding = record2.find('leader').text[17].encode('utf-8')
4343
type = record2.find('controlfield', {'tag' : '008'}).text[23:24]

0 commit comments

Comments
 (0)