Skip to content

Commit 828da2a

Browse files
author
ehanson8
committed
add scripts
1 parent a6c2f4e commit 828da2a

11 files changed

+722
-0
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,4 @@ $RECYCLE.BIN/
4545
Network Trash Folder
4646
Temporary Items
4747
.apdisk
48+
secrets.py

compareTwoKeysInCommunity.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
"""Export the values of two metadata keys for every item in a community.

Prompts for a community ID and two metadata keys, collects the IDs of all
items in the community's collections through the DSpace REST API, and writes
one CSV row per item (itemID, value of first key, value of second key) to
``<filePath><key>-<key2>Values.csv``.

Written for Python 2 (``raw_input``, byte-mode CSV output). Connection
settings come from the ``secrets`` module.
"""
import json
import requests
import secrets
import csv
import time

baseURL = secrets.baseURL
email = secrets.email
password = secrets.password
filePath = secrets.filePath


def formatElapsed(start):
    """Return the time elapsed since ``start`` as an ``h:mm:ss`` string."""
    m, s = divmod(time.time() - start, 60)
    h, m = divmod(m, 60)
    return '%d:%02d:%02d' % (h, m, s)


def toUtf8(value):
    """Encode unicode text as UTF-8 bytes; return any other value unchanged.

    The Python 2 csv module cannot write non-ASCII unicode objects, so text
    has to be encoded before it is handed to the writer.
    """
    if isinstance(value, type(u'')):
        return value.encode('utf-8')
    return value


def fetchItemIDs(collectionID):
    """Return the IDs of all items in a collection, fetched 1000 per page."""
    itemIDs = []
    offset = 0
    while True:
        url = baseURL+'/rest/collections/'+str(collectionID)+'/items?limit=1000&offset='+str(offset)
        response = requests.get(url, headers=headerAuth)
        # Retry every 5 seconds until the server answers with 200.
        # NOTE(review): there is no retry limit, so a permanent error loops forever.
        while response.status_code != 200:
            time.sleep(5)
            response = requests.get(url, headers=headerAuth)
        page = response.json()
        # An empty page means the previous page was the last one.
        if page == []:
            break
        itemIDs.extend(item['id'] for item in page)
        offset = offset + 1000
    return itemIDs


communityID = raw_input('Enter community ID: ')
key = raw_input('Enter first key: ')
key2 = raw_input('Enter second key: ')

startTime = time.time()
data = json.dumps({'email':email,'password':password})
header = {'content-type':'application/json','accept':'application/json'}
# The body of the login response is sent back as the session token.
session = requests.post(baseURL+'/rest/login', headers=header, data=data).content
headerAuth = {'content-type':'application/json','accept':'application/json', 'rest-dspace-token':session}
print('authenticated')

itemList = []
collections = requests.get(baseURL+'/rest/communities/'+str(communityID)+'/collections', headers=headerAuth).json()
for collection in collections:
    collectionID = collection['id']
    # NOTE(review): collection 24 is excluded; the reason is not visible here.
    if collectionID != 24:
        itemList.extend(fetchItemIDs(collectionID))
print('Item list creation time:  ' + formatElapsed(startTime))

valueList = []
for number, itemID in enumerate(itemList):
    itemsRemaining = len(itemList) - number
    print('Items remaining:  %s ItemID:  %s' % (itemsRemaining, itemID))
    metadata = requests.get(baseURL+'/rest/items/'+str(itemID)+'/metadata', headers=headerAuth).json()
    tupleValue1 = ''
    tupleValue2 = ''
    for entry in metadata:
        # When a key occurs more than once, the last occurrence wins.
        if entry['key'] == key:
            tupleValue1 = entry['value']
        if entry['key'] == key2:
            tupleValue2 = entry['value']
    itemTuple = (itemID, tupleValue1, tupleValue2)
    valueList.append(itemTuple)
    print(itemTuple)
print(valueList)

print('Value list creation time:  ' + formatElapsed(startTime))

# The with-block guarantees the CSV is flushed and closed before logout.
with open(filePath+key+'-'+key2+'Values.csv', 'wb') as csvFile:
    f = csv.writer(csvFile)
    f.writerow(['itemID', key, key2])
    for row in valueList:
        f.writerow([toUtf8(cell) for cell in row])

logout = requests.post(baseURL+'/rest/logout', headers=headerAuth)

print('Total script run time:  ' + formatElapsed(startTime))

findBogusUris.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
"""Report items whose dc.identifier.uri does not start with the handle prefix.

Collects the IDs of all items in every community's collections through the
DSpace REST API, then writes an (itemID, uri) row to
``<filePath>bogusUris.csv`` for each ``dc.identifier.uri`` value that does
not start with ``secrets.handlePrefix``.

Written for Python 2 (byte-mode CSV output). Connection settings come from
the ``secrets`` module.
"""
import json
import requests
import secrets
import csv
import time

baseURL = secrets.baseURL
email = secrets.email
password = secrets.password
filePath = secrets.filePath
handlePrefix = secrets.handlePrefix


def formatElapsed(start):
    """Return the time elapsed since ``start`` as an ``h:mm:ss`` string."""
    m, s = divmod(time.time() - start, 60)
    h, m = divmod(m, 60)
    return '%d:%02d:%02d' % (h, m, s)


def fetchItemIDs(collectionID):
    """Return the IDs of all items in a collection, fetched 1000 per page."""
    itemIDs = []
    offset = 0
    while True:
        url = baseURL+'/rest/collections/'+str(collectionID)+'/items?limit=1000&offset='+str(offset)
        response = requests.get(url, headers=headerAuth)
        # Retry every 5 seconds until the server answers with 200.
        # NOTE(review): there is no retry limit, so a permanent error loops forever.
        while response.status_code != 200:
            time.sleep(5)
            response = requests.get(url, headers=headerAuth)
        page = response.json()
        # An empty page means the previous page was the last one.
        if page == []:
            break
        itemIDs.extend(item['id'] for item in page)
        offset = offset + 1000
    return itemIDs


startTime = time.time()
data = json.dumps({'email':email,'password':password})
header = {'content-type':'application/json','accept':'application/json'}
# The body of the login response is sent back as the session token.
session = requests.post(baseURL+'/rest/login', headers=header, data=data).content
headerAuth = {'content-type':'application/json','accept':'application/json', 'rest-dspace-token':session}
print('authenticated')

itemList = []
endpoint = baseURL+'/rest/communities'
communities = requests.get(endpoint, headers=headerAuth).json()
for community in communities:
    communityID = community['id']
    collections = requests.get(baseURL+'/rest/communities/'+str(communityID)+'/collections', headers=headerAuth).json()
    for collection in collections:
        collectionID = collection['id']
        # NOTE(review): collection 24 is excluded; the reason is not visible here.
        if collectionID != 24:
            itemList.extend(fetchItemIDs(collectionID))
print('Item list creation time:  ' + formatElapsed(startTime))

# The with-block guarantees the CSV is flushed and closed before logout.
with open(filePath+'bogusUris.csv', 'wb') as csvFile:
    f = csv.writer(csvFile)
    f.writerow(['itemID', 'uri'])
    for number, itemID in enumerate(itemList):
        itemsRemaining = len(itemList) - number
        print('Items remaining:  %s ItemID:  %s' % (itemsRemaining, itemID))
        metadata = requests.get(baseURL+'/rest/items/'+str(itemID)+'/metadata', headers=headerAuth).json()
        for entry in metadata:
            if entry['key'] == 'dc.identifier.uri':
                value = entry['value']
                # str() on unicode raises for non-ASCII text under Python 2,
                # so unicode values are encoded explicitly instead.
                if isinstance(value, type(u'')):
                    uri = value.encode('utf-8')
                else:
                    uri = str(value)
                if not uri.startswith(handlePrefix):
                    f.writerow([itemID, uri])

logout = requests.post(baseURL+'/rest/logout', headers=headerAuth)

print('Total script run time:  ' + formatElapsed(startTime))

findDuplicateKeys.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
"""List the items in which a given metadata key occurs more than once.

Prompts for a metadata key, collects the IDs of all items in every
community's collections through the DSpace REST API, and writes the ID of
each item whose metadata holds that key two or more times to
``<filePath>recordsWithDuplicate<key>.csv``.

Written for Python 2 (``raw_input``, byte-mode CSV output). Connection
settings come from the ``secrets`` module.
"""
import json
import requests
import secrets
import time
import csv

baseURL = secrets.baseURL
email = secrets.email
password = secrets.password
filePath = secrets.filePath


def formatElapsed(start):
    """Return the time elapsed since ``start`` as an ``h:mm:ss`` string."""
    m, s = divmod(time.time() - start, 60)
    h, m = divmod(m, 60)
    return '%d:%02d:%02d' % (h, m, s)


def fetchItemIDs(collectionID):
    """Return the IDs of the items in a collection (one request, limit=100000)."""
    url = baseURL+'/rest/collections/'+str(collectionID)+'/items?limit=100000'
    response = requests.get(url, headers=headerAuth)
    # Retry every 5 seconds until the server answers with 200.
    # NOTE(review): there is no retry limit, so a permanent error loops forever.
    while response.status_code != 200:
        time.sleep(5)
        response = requests.get(url, headers=headerAuth)
    return [item['id'] for item in response.json()]


key = raw_input('Enter key: ')

startTime = time.time()
data = json.dumps({'email':email,'password':password})
header = {'content-type':'application/json','accept':'application/json'}
# The body of the login response is sent back as the session token.
session = requests.post(baseURL+'/rest/login', headers=header, data=data).content
headerAuth = {'content-type':'application/json','accept':'application/json', 'rest-dspace-token':session}
print('authenticated')

itemList = []
endpoint = baseURL+'/rest/communities'
communities = requests.get(endpoint, headers=headerAuth).json()
for community in communities:
    communityID = community['id']
    collections = requests.get(baseURL+'/rest/communities/'+str(communityID)+'/collections', headers=headerAuth).json()
    for collection in collections:
        collectionID = collection['id']
        # NOTE(review): collection 24 is excluded; the reason is not visible here.
        if collectionID != 24:
            itemList.extend(fetchItemIDs(collectionID))
print('Item list creation time:  ' + formatElapsed(startTime))

# The with-block guarantees the CSV is flushed and closed before logout.
with open(filePath+'recordsWithDuplicate'+key+'.csv', 'wb') as csvFile:
    f = csv.writer(csvFile)
    f.writerow(['itemID'])
    for number, itemID in enumerate(itemList):
        itemsRemaining = len(itemList) - number
        print('Items remaining:  %s ItemID:  %s' % (itemsRemaining, itemID))
        metadata = requests.get(baseURL+'/rest/items/'+str(itemID)+'/metadata', headers=headerAuth).json()
        # Count the entries whose key matches. Searching the serialized JSON
        # text would also match a *value* that happens to equal the key.
        occurrences = sum(1 for entry in metadata if entry['key'] == key)
        if occurrences > 1:
            f.writerow([itemID])

logout = requests.post(baseURL+'/rest/logout', headers=headerAuth)

print('Total script run time:  ' + formatElapsed(startTime))

getCollectionMetadataJson.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
"""Dump the metadata of every item in one collection to a JSON file.

Prompts for a collection handle, resolves it to a collection ID through the
DSpace REST API, fetches the metadata of each item in that collection, and
writes the list of metadata records to ``<filePath><handle>.json`` (with
``/`` in the handle replaced by ``-``).

Written for Python 2 (``raw_input``). Connection settings come from the
``secrets`` module.
"""
import json
import requests
import secrets
import time

baseURL = secrets.baseURL
email = secrets.email
password = secrets.password
filePath = secrets.filePath

handle = raw_input('Enter handle: ')

data = json.dumps({'email':email,'password':password})
header = {'content-type':'application/json','accept':'application/json'}
# The body of the login response is sent back as the session token.
session = requests.post(baseURL+'/rest/login', headers=header, data=data).content
headerAuth = {'content-type':'application/json','accept':'application/json', 'rest-dspace-token':session}
print('authenticated')
# The timer starts after authentication, so login time is not counted.
startTime = time.time()

endpoint = baseURL+'/rest/handle/'+handle
collection = requests.get(endpoint, headers=headerAuth).json()
collectionID = collection['id']

# Page through the collection with limit/offset, as the sibling scripts do.
# A single request without limit/offset may return only part of a large
# collection (NOTE(review): confirm the server's default page size).
itemList = []
offset = 0
while True:
    endpoint = baseURL+'/rest/collections/'+str(collectionID)+'/items?limit=1000&offset='+str(offset)
    output = requests.get(endpoint, headers=headerAuth).json()
    # An empty page means the previous page was the last one.
    if not output:
        break
    itemList.extend(item['id'] for item in output)
    offset = offset + 1000

metadataGroup = []
for itemID in itemList:
    metadata = requests.get(baseURL+'/rest/items/'+str(itemID)+'/metadata', headers=headerAuth).json()
    metadataGroup.append(metadata)

# The with-block guarantees the JSON is flushed and closed before logout.
with open(filePath+handle.replace('/','-')+'.json', 'w') as f:
    json.dump(metadataGroup, f)

logout = requests.post(baseURL+'/rest/logout', headers=headerAuth)

elapsedTime = time.time() - startTime
m, s = divmod(elapsedTime, 60)
h, m = divmod(m, 60)
print('%d:%02d:%02d' % (h, m, s))
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
"""Build per-key value lists and per-key unique-value counts for all items.

Phase 1 collects the IDs of all items in every community's collections
through the DSpace REST API and appends every metadata value (except
``dc.description.provenance``) to ``completeValueLists/<key>Values.csv``.
Phase 2 reads each of those files and writes the distinct values with their
zero-padded counts to ``uniqueValueLists/<key>Values.csv``.

Both directories live under ``secrets.filePath`` and must already exist.
Phase 1 appends to existing files, so rows from an earlier run are kept.

Written for Python 2 (byte-mode CSV files). Connection settings come from
the ``secrets`` module.
"""
import json
import requests
import secrets
import csv
import time
import os.path
from collections import Counter

baseURL = secrets.baseURL
email = secrets.email
password = secrets.password
filePath = secrets.filePath

# Build both directories the same way. The trailing '' makes join() end the
# path with a separator, so file names can be appended directly.
filePathComplete = os.path.join(filePath, 'completeValueLists', '')
filePathUnique = os.path.join(filePath, 'uniqueValueLists', '')


def formatElapsed(start):
    """Return the time elapsed since ``start`` as an ``h:mm:ss`` string."""
    m, s = divmod(time.time() - start, 60)
    h, m = divmod(m, 60)
    return '%d:%02d:%02d' % (h, m, s)


def fetchItemIDs(collectionID):
    """Return the IDs of all items in a collection, fetched 1000 per page."""
    itemIDs = []
    offset = 0
    while True:
        url = baseURL+'/rest/collections/'+str(collectionID)+'/items?limit=1000&offset='+str(offset)
        response = requests.get(url, headers=headerAuth)
        # Retry every 5 seconds until the server answers with 200.
        # NOTE(review): there is no retry limit, so a permanent error loops forever.
        while response.status_code != 200:
            time.sleep(5)
            response = requests.get(url, headers=headerAuth)
        page = response.json()
        # An empty page means the previous page was the last one.
        if page == []:
            break
        itemIDs.extend(item['id'] for item in page)
        offset = offset + 1000
    return itemIDs


startTime = time.time()
data = json.dumps({'email':email,'password':password})
header = {'content-type':'application/json','accept':'application/json'}
# The body of the login response is sent back as the session token.
session = requests.post(baseURL+'/rest/login', headers=header, data=data).content
headerAuth = {'content-type':'application/json','accept':'application/json', 'rest-dspace-token':session}
print('authenticated')

itemList = []
endpoint = baseURL+'/rest/communities'
communities = requests.get(endpoint, headers=headerAuth).json()
for community in communities:
    communityID = community['id']
    collections = requests.get(baseURL+'/rest/communities/'+str(communityID)+'/collections', headers=headerAuth).json()
    for collection in collections:
        collectionID = collection['id']
        # NOTE(review): collection 24 is excluded; the reason is not visible here.
        if collectionID != 24:
            itemList.extend(fetchItemIDs(collectionID))
print('Item list creation time:  ' + formatElapsed(startTime))

for number, itemID in enumerate(itemList):
    itemsRemaining = len(itemList) - number
    print('Items remaining:  %s ItemID:  %s' % (itemsRemaining, itemID))
    metadata = requests.get(baseURL+'/rest/items/'+str(itemID)+'/metadata', headers=headerAuth).json()
    for entry in metadata:
        if entry['key'] != 'dc.description.provenance':
            key = entry['key']
            value = entry['value'].encode('utf-8')
            csvPath = filePathComplete+key+'Values.csv'
            # Check before opening: append mode creates the file if missing.
            isNewFile = not os.path.isfile(csvPath)
            # Binary append keeps line endings consistent with the header row
            # and the with-block closes the handle after every write.
            with open(csvPath, 'ab') as csvFile:
                f = csv.writer(csvFile)
                if isNewFile:
                    f.writerow(['itemID', 'value'])
                f.writerow([itemID, value])

print('Complete value list creation time:  ' + formatElapsed(startTime))

for fileName in os.listdir(filePathComplete):
    # Skip stray non-CSV files (e.g. OS metadata files) that lack a 'value' column.
    if not fileName.endswith('.csv'):
        continue
    with open(filePathComplete+fileName, 'rb') as completeFile:
        valueListCount = Counter(row['value'] for row in csv.DictReader(completeFile))
    with open(filePathUnique+fileName, 'wb') as uniqueFile:
        f = csv.writer(uniqueFile)
        f.writerow(['value', 'count'])
        for uniqueValue, count in valueListCount.items():
            # Counts are zero-padded to six digits.
            f.writerow([uniqueValue, str(count).zfill(6)])

logout = requests.post(baseURL+'/rest/logout', headers=headerAuth)

print('Total script run time:  ' + formatElapsed(startTime))

0 commit comments

Comments
 (0)