forked from mpeel/wikicode
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcommons_import_interwiki.py
96 lines (88 loc) · 2.58 KB
/
commons_import_interwiki.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Import commons sitelinks based on interwikis on Commons
# Mike Peel 14-Aug-2020 v1 - start
from __future__ import unicode_literals
import pywikibot
import numpy as np
import time
import string
from pywikibot import pagegenerators
import urllib
import time
import pprint
import csv
from pibot_functions import *
def prettyPrint(variable):
    """Pretty-print ``variable`` to standard output.

    Debugging helper: formats the object with ``pprint`` using an indent of
    4 spaces per nesting level and writes the result to ``sys.stdout``.

    :param variable: any object that ``pprint`` can format.
    :return: None
    """
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(variable)
# Site handles used by the script: Wikidata, its data repository, and Commons.
wikidata_site = pywikibot.Site("wikidata", "wikidata")
repo = wikidata_site.data_repository()  # this is a DataSite object
commons = pywikibot.Site('commons', 'commons')

# Wikipedia language editions whose interwiki links on Commons categories are
# used to find the Wikidata item that should receive the Commons sitelink.
languages = ['en', 'de', 'fr', 'es', 'pt', 'it', 'nl', 'pl', 'ru']

for language in languages:
    wikipedia_site = pywikibot.Site(language, 'wikipedia')

    # The matching below relies on str(link) looking like
    # '[[wikipedia:<lang>:Title]]'.  The trailing colon is required so that a
    # short code does not also match a longer one sharing its prefix
    # (e.g. 'fr' vs 'frr', 'ru' vs 'rue').
    link_marker = 'wikipedia:' + language + ':'
    link_prefix = '[[' + link_marker

    # Search Commons namespace 14 for pages whose wikitext matches this
    # insource: query, i.e. pages that still carry an old-style interwiki.
    query = 'insource:/"[[' + language + '"/'
    generator = pagegenerators.SearchPageGenerator(query, site=commons, namespaces=[14])
    gen = pagegenerators.PreloadingGenerator(generator)

    for count, category in enumerate(gen):
        print(str(count) + ' - ' + category.title())

        # Skip categories that are already connected to a Wikidata item.
        # Any failure to resolve/load the item is treated as "not connected"
        # (best effort), but KeyboardInterrupt/SystemExit are not swallowed.
        try:
            linked_item = pywikibot.ItemPage.fromPage(category)
            linked_item.get()
        except Exception:
            print('Category does not have a current sitelink')
        else:
            print('We already have a sitelink!')
            continue

        # Find the interwiki link pointing at this language's Wikipedia.
        # If several match, the last one wins.
        article_title = ''
        for iw in category.interwiki():
            try:
                print(iw)
                link_text = str(iw)
            except Exception:
                # A link that cannot be rendered is simply ignored.
                continue
            if link_marker in link_text:
                article_title = link_text.replace(link_prefix, '').replace(']]', '')
        if not article_title:
            continue

        # Resolve the Wikipedia page to its Wikidata item.
        page = pywikibot.Page(wikipedia_site, article_title)
        try:
            wd_item = pywikibot.ItemPage.fromPage(page)
            print(wd_item)
            item_dict = wd_item.get()
        except Exception:
            print('Huh - no page found')
            continue

        # If the item has P910 claims, the sitelink belongs on the claim's
        # target item instead.  With several claims the last usable one wins.
        try:
            p910_claims = item_dict['claims']['P910']
        except KeyError:
            print('P910 not found')
        else:
            print('P910 exists, following that.')
            for claim in p910_claims:
                # Load the target into temporaries first so that a missing or
                # unloadable target never clobbers the item found above.
                target_item = claim.getTarget()
                if target_item is None:
                    print('P910 claim has no target, skipping it.')
                    continue
                try:
                    target_dict = target_item.get()
                except Exception:
                    print('P910 target could not be loaded, skipping it.')
                    continue
                wd_item = target_item
                item_dict = target_dict
                print(wd_item.title())

        # Only the presence of a commonswiki sitelink matters here; an item
        # that already has one is left untouched.
        try:
            item_dict['sitelinks']['commonswiki']
        except KeyError:
            pass
        else:
            print('Has sitelink')
            continue

        # No existing sitelink found, add the new one.
        data = {'sitelinks': [{'site': 'commonswiki', 'title': category.title()}]}
        print("\n\n")
        print('https://commons.wikimedia.org/wiki/' + category.title().replace(' ', '_'))
        print('https://www.wikidata.org/wiki/' + str(wd_item.title()))
        # The edit is saved without an interactive confirmation prompt.
        try:
            wd_item.editEntity(data, summary=u'Add commons sitelink based on interwiki on Commons')
        except Exception:
            print('Edit failed')
# EOF