Skip to content

Commit 7b34363

Browse files
author
ehanson8
committed
updates
1 parent a09c2ad commit 7b34363

File tree

5 files changed

+196
-30
lines changed

5 files changed

+196
-30
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,4 +49,5 @@ secrets.py
4949
*.pyc
5050
data/*
5151
!data/.keep
52-
.profile
52+
.profile
53+
*.csv

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ More information about WSKeys is available [here](https://www.oclc.org/developer
88
#### [oclcIsbn.py](oclcIsbn.py)
99
This script retrieves OCLC numbers and titles based on a text file of ISBNs.
1010

11+
#### [oclcTitlePhraseBorrowDirect.py](oclcTitlePhraseBorrowDirect.py)
12+
This script retrieves OCLC data based on a CSV from the BorrowDirect Data Repository (Beta), on the Penn Library Data Farm.
13+
1114
#### [oclcTitlePhraseEnhanced.py](oclcTitlePhraseEnhanced.py)
1215
This script retrieves OCLC records based on a text file of titles and extracts the title, URL, author, publisher, encoding level, language, and date.
1316

oclcSearchForNewNum.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
"""Report OCLC numbers whose WorldCat record now carries a different number.

Reads the ``bibNum`` and ``oclcNum`` columns of ``noHathiTrustMatch.csv``,
requests each number from the WorldCat content endpoint, and writes
``newOclcNumResults.csv`` with one row per input row. The ``newOclcNum``
column holds the record's 001 control field when it differs from the number
searched (leading zeros ignored on both sides) and is blank otherwise,
including when the lookup fails.

The script targets Python 2 (the csv output is opened in binary mode).
Every print uses the single-argument call form so the text it emits there
is the same as with a print statement.
"""
import csv
import time

import requests
from bs4 import BeautifulSoup

import secrets  # expected to provide ``wskey``; presumably the local, git-ignored secrets.py

# Seconds to wait on WorldCat before giving up on one lookup; without a
# timeout a single stalled connection would hang the whole run.
REQUEST_TIMEOUT = 30

startTime = time.time()

baseURL = 'http://www.worldcat.org/webservices/catalog/content/'
wskey = secrets.wskey

filename = 'noHathiTrustMatch.csv'

# Both files are managed by the with-statement so the results are flushed
# and closed even if the loop stops early.
with open(filename) as csvfile, open('newOclcNumResults.csv', 'wb') as outfile:
    f = csv.writer(outfile)
    f.writerow(['bibNum', 'search', 'newOclcNum'])
    reader = csv.DictReader(csvfile)
    for counter, row in enumerate(reader, 1):
        print(counter)
        search = row['oclcNum']
        bibNum = row['bibNum']
        # Normalise once: the same stripped value is sent to the API and
        # used in the comparison, so stray whitespace in the CSV cannot
        # produce a false "number changed" result.
        searched = search.strip()
        try:
            response = requests.get(baseURL + searched + '?wskey=' + wskey,
                                    timeout=REQUEST_TIMEOUT).content
            record = BeautifulSoup(response, "lxml").find('record')
            oclcNum = record.find('controlfield', {'tag': '001'}).text.lstrip('0')
        except Exception:
            # Best effort: a failed request or a response without a record
            # is reported as "no number found". Catching Exception rather
            # than everything keeps Ctrl-C able to stop the run.
            oclcNum = ''
        if searched.lstrip('0') != oclcNum:
            print('%s %s' % (search, oclcNum))
        else:
            # Same number came back: nothing new to report for this row.
            oclcNum = ''
        f.writerow([bibNum, search, oclcNum])

elapsedTime = time.time() - startTime
m, s = divmod(elapsedTime, 60)
h, m = divmod(m, 60)
print('Total script run time:  %d:%02d:%02d' % (h, m, s))

oclcTitlePhraseBorrowDirect.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
"""Match rows of a BorrowDirect CSV export against WorldCat records.

Prompts for a CSV file name and reads the BORROWER, LENDER, STATUS,
PATRON TYPE, OCLC, ISBN, AUTHOR, TITLE and PUBLICATION YEAR columns.
For each row the OCLC number is looked up directly; if that fails, the
title is searched and the first hit is used. Rows for which a full record
could be read are written to ``<name>oclcSearchMatches.csv`` together with
fields taken from that record; all other rows are written to
``<name>oclcSearchNonMatches.csv``.

The script targets Python 2 (``raw_input``, ``urllib.quote``, binary-mode
csv output). Every print uses the single-argument call form so the text it
emits there is the same as with a print statement.
"""
import csv
import os
import re
import time
import urllib

import requests
from bs4 import BeautifulSoup

import secrets  # expected to provide ``wskey``; presumably the local, git-ignored secrets.py

# Seconds to wait on WorldCat before giving up on one request; without a
# timeout a single stalled connection would hang the whole run.
REQUEST_TIMEOUT = 30

startTime = time.time()

fileName = raw_input('Enter file name: ')
# splitext only removes the final extension, so paths such as './in.csv'
# or names without any dot no longer break the output file names.
fileNameWithoutExtension = os.path.splitext(fileName)[0]

baseURL = 'http://www.worldcat.org/webservices/catalog/search/opensearch?q='
baseURL2 = 'http://www.worldcat.org/webservices/catalog/content/'

wskey = secrets.wskey


def fetchContent(url):
    """Return the raw body of a GET request to url."""
    return requests.get(url, timeout=REQUEST_TIMEOUT).content


def subfieldText(record, tag, code):
    """Return the UTF-8 text of the first tag/code subfield, or '' if absent."""
    try:
        datafield = record.find('datafield', {'tag': tag})
        return datafield.find('subfield', {'code': code}).text.encode('utf-8')
    except AttributeError:
        # find() returned None somewhere along the chain: field not present.
        return ''


def findOclcNum(searchOclcNum, searchTitle):
    """Return (oclcNum, oclcAuthor) for a row, or ('', '') when nothing is found.

    The supplied OCLC number is tried first; the author is left blank in
    that case because only the title-search result carries one. When the
    direct lookup fails, the title is searched and the first item is used.
    """
    try:
        response = fetchContent(baseURL2 + searchOclcNum + '?format=rss&wskey=' + wskey)
        record = BeautifulSoup(response, "lxml").find('record')
        return record.find('controlfield', {'tag': '001'}).text, ''
    except Exception:
        # Best effort: any failure of the direct lookup falls through to
        # the title search below.
        pass
    search = urllib.quote(searchTitle)
    print(search)
    response = fetchContent(baseURL + search.strip() + '&count=1&format=rss&wskey=' + wskey)
    items = BeautifulSoup(response, "lxml").findAll('item')
    if not items:
        return '', ''
    item = items[0]
    url = item.find('guid').text.encode('utf-8')
    oclcNum = url.replace('http://worldcat.org/oclc/', '')
    try:
        oclcAuthor = item.find('author').find('name').text.encode('utf-8')
    except AttributeError:
        # A hit without an author element should not abort the run.
        oclcAuthor = ''
    return oclcNum, oclcAuthor


def lookupRecord(searchOclcNum, searchTitle):
    """Return the record-derived columns for one row, or None if no record.

    The returned list is ordered as oclcNum, oclcTitle, oclcAuthor,
    oclcPublisher, callNumLetters, callNumFull, physDesc, oclcDate.
    Every value is computed afresh per call, so nothing can leak from a
    previously processed row.
    """
    oclcNum, oclcAuthor = findOclcNum(searchOclcNum, searchTitle)
    if not oclcNum:
        return None
    response2 = fetchContent(
        baseURL2 + oclcNum
        + '?servicelevel=full&classificationScheme=LibraryOfCongress&wskey=' + wskey)
    record2 = BeautifulSoup(response2, "lxml").find('record')
    if record2 is None:
        return None
    oclcTitle = subfieldText(record2, '245', 'a') + ' ' + subfieldText(record2, '245', 'b')
    # A record without an 008 field raises here; the caller files the row
    # as a non-match.
    oclcDate = record2.find('controlfield', {'tag': '008'}).text[7:11].encode('utf-8')
    callNumFullA = subfieldText(record2, '050', 'a')
    numStart = re.search(r'\d', callNumFullA)
    if numStart is None:
        # NOTE(review): a 050$a without any digit is blanked entirely,
        # which is what the script has always done - confirm this is wanted.
        callNumFullA = ''
        callNumLetters = ''
    else:
        # Everything before the first digit.
        callNumLetters = callNumFullA[:numStart.start()]
    callNumFull = callNumFullA + ' ' + subfieldText(record2, '050', 'b')
    # Publisher is read from 260$b, falling back to 264$b.
    oclcPublisher = subfieldText(record2, '260', 'b') or subfieldText(record2, '264', 'b')
    physDesc = subfieldText(record2, '300', 'a')
    return [oclcNum, oclcTitle, oclcAuthor, oclcPublisher,
            callNumLetters, callNumFull, physDesc, oclcDate]


searchHeader = ['searchOclcNum', 'borrower', 'lender', 'status', 'patronType',
                'isbn', 'searchTitle', 'searchAuthor', 'searchDate']
oclcHeader = ['oclcNum', 'oclcTitle', 'oclcAuthor', 'oclcPublisher',
              'callNumLetters', 'callNumFull', 'physDesc', 'oclcDate']

# All three files are managed by the with-statement so both result files
# are flushed and closed even if the loop stops early.
with open(fileName) as csvfile, \
        open(fileNameWithoutExtension + 'oclcSearchMatches.csv', 'wb') as matchFile, \
        open(fileNameWithoutExtension + 'oclcSearchNonMatches.csv', 'wb') as nonMatchFile:
    f = csv.writer(matchFile)
    f.writerow(searchHeader + oclcHeader)
    f2 = csv.writer(nonMatchFile)
    f2.writerow(searchHeader)
    for row in csv.DictReader(csvfile):
        searchOclcNum = row['OCLC']
        print(searchOclcNum)
        searchTitle = row['TITLE']
        searchColumns = [searchOclcNum, row['BORROWER'], row['LENDER'],
                         row['STATUS'], row['PATRON TYPE'], row['ISBN'],
                         searchTitle, row['AUTHOR'], row['PUBLICATION YEAR']]
        try:
            oclcColumns = lookupRecord(searchOclcNum, searchTitle)
        except Exception:
            # Best effort: a row that cannot be resolved for any reason is
            # reported as a non-match rather than stopping the batch.
            # Catching Exception keeps Ctrl-C able to stop the run.
            oclcColumns = None
        if oclcColumns is None:
            f2.writerow(searchColumns)
        else:
            f.writerow(searchColumns + oclcColumns)

elapsedTime = time.time() - startTime
m, s = divmod(elapsedTime, 60)
h, m = divmod(m, 60)
print('Total script run time:  %d:%02d:%02d' % (h, m, s))

oclcTitlePhraseEnhanced.py

Lines changed: 48 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -2,44 +2,63 @@
22
from bs4 import BeautifulSoup
33
import csv
44
import secrets
5+
import urllib
56

67
baseURL = 'http://www.worldcat.org/webservices/catalog/search/opensearch?q='
78
baseURL2 = 'http://www.worldcat.org/webservices/catalog/content/'
89
wskey = secrets.wskey
9-
f=csv.writer(open('resultsTitle.csv', 'wb'))
10-
f.writerow(['bibNumber']+['searchTitle']+['oclcTitle']+['url']+['author']+['publisher']+['encoding']+['lang']+['date'])
11-
with open('oclcRecordsTitle.txt') as txt:
12-
for row in txt:
13-
bibNumber = row[:row.index('|')]
14-
searchTitle = row[row.index('|')+1:]
15-
search = searchTitle.replace(' ','%20')
10+
f=csv.writer(open('oclcTitleSearchMatches.csv', 'wb'))
11+
f.writerow(['bibNumber']+['searchTitle']+['oclcTitle']+['oclcNum']+['url']+['author']+['publisher']+['physDesc']+['encoding']+['date'])
12+
f2=csv.writer(open('oclcTitleSearchNonMatches.csv', 'wb'))
13+
f2.writerow(['bibNumber']+['searchTitle'])
14+
with open('oclcRecordsTitle.csv') as csvfile:
15+
reader = csv.DictReader(csvfile)
16+
for row in reader:
17+
bibNumber = row['bib#']
18+
print bibNumber
19+
searchTitle = row['245 - all subfields'][2:]
20+
originalTitle = searchTitle
21+
if 'b' in searchTitle:
22+
searchTitle = searchTitle[:searchTitle.index('b')] + ' ' + searchTitle[searchTitle.index('b')+2:]
23+
if 'c' in searchTitle:
24+
searchTitle = searchTitle[:searchTitle.index('c')]
25+
else:
26+
pass
27+
elif 'c' in searchTitle:
28+
searchTitle = searchTitle[:searchTitle.index('c')]
29+
else:
30+
pass
31+
search = urllib.quote(searchTitle)
1632
response = requests.get(baseURL+search.strip()+'&format=rss&wskey='+wskey).content
1733
records = BeautifulSoup(response, "lxml").findAll('item')
18-
for record in records:
19-
try:
20-
title = record.find('title').text.encode('utf-8')
34+
if records != []:
35+
for record in records:
36+
oclcTitle = record.find('title').text.encode('utf-8')
2137
url = record.find('guid').text.encode('utf-8')
38+
oclcNum = url.replace('http://worldcat.org/oclc/','')
2239
author = record.find('author').find('name').text.encode('utf-8')
23-
except:
24-
title = ''
25-
url = ''
26-
author = ''
27-
recordNumber = url.replace('http://worldcat.org/oclc/','')
28-
response2 = requests.get(baseURL2+recordNumber+'?classificationScheme=LibraryOfCongress&wskey='+wskey).content
29-
record2 = BeautifulSoup(response2, "lxml").find('record')
30-
try:
40+
response2 = requests.get(baseURL2+url+'?servicelevel=full&classificationScheme=LibraryOfCongress&wskey='+wskey).content
41+
record2 = BeautifulSoup(response2, "lxml").find('record')
3142
encoding = record2.find('leader').text[17].encode('utf-8')
43+
type = record2.find('controlfield', {'tag' : '008'}).text[23:24]
3244
date = record2.find('controlfield', {'tag' : '008'}).text[7:11].encode('utf-8')
33-
lang = record2.find('controlfield', {'tag' : '008'}).text[35:38].encode('utf-8')
34-
except:
35-
date = ''
36-
lang = ''
37-
encoding = ''
38-
try:
39-
publisher = record2.find('datafield', {'tag' : '260'}).find('subfield', {'code' : 'b'}).text.encode('utf-8')
40-
except:
4145
try:
42-
publisher = record2.find('datafield', {'tag' : '264'}).find('subfield', {'code' : 'b'}).text.encode('utf-8')
46+
publisher = record2.find('datafield', {'tag' : '260'}).find('subfield', {'code' : 'b'}).text.encode('utf-8')
4347
except:
44-
publisher = ''
45-
f.writerow([bibNumber]+[searchTitle]+[title]+[url]+[author]+[publisher]+[encoding]+[lang]+[date])
48+
try:
49+
publisher = record2.find('datafield', {'tag' : '264'}).find('subfield', {'code' : 'b'}).text.encode('utf-8')
50+
except:
51+
publisher = ''
52+
try:
53+
catLang = record2.find('datafield', {'tag' : '040'}).find('subfield', {'code' : 'b'}).text.encode('utf-8')
54+
except:
55+
catLang = ''
56+
try:
57+
physDesc = record2.find('datafield', {'tag' : '300'}).find('subfield', {'code' : 'a'}).text.encode('utf-8')
58+
except:
59+
physDesc = ''
60+
if type == ' ' and (catLang == 'eng' or catLang == ''):
61+
f.writerow([bibNumber]+[searchTitle]+[oclcTitle]+[oclcNum]+[url]+[author]+[publisher]+[physDesc]+[encoding]+[date])
62+
f.writerow(['']+['']+['']+['']+['']+['']+['']+['']+['']+[''])
63+
else:
64+
f2.writerow([bibNumber]+[searchTitle])

0 commit comments

Comments
 (0)