-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
544 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
import cv2 | ||
import os | ||
|
||
# reference: https://my.oschina.net/u/4399904/blog/4237625 | ||
|
||
|
||
def calculate(image1, image2):
    """Return the histogram similarity of two images.

    A 256-bin histogram of channel 0 is computed for each image.  Every bin
    contributes ``1 - |h1 - h2| / max(h1, h2)`` to the score, or ``1`` when
    both bins hold the same count, and the contributions are averaged over
    the number of bins.

    Args:
        image1: First image, in a form accepted by ``cv2.calcHist``.
        image2: Second image, in a form accepted by ``cv2.calcHist``.

    Returns:
        The mean per-bin similarity.  The value keeps whatever array-like
        form indexing a ``cv2.calcHist`` result yields (presumably a
        one-element array -- confirm before unpacking it as a scalar).
    """
    # cv2.calcHist treats the upper boundary as exclusive.  With 255 as the
    # bound, pixels of value 255 were silently dropped and the last bin was
    # always empty; 256 gives every 8-bit intensity its own bin.
    hist_range = [0.0, 256.0]
    hist1 = cv2.calcHist([image1], [0], None, [256], hist_range)
    hist2 = cv2.calcHist([image2], [0], None, [256], hist_range)

    degree = 0
    for count1, count2 in zip(hist1, hist2):
        if count1 != count2:
            # The counts differ, so the larger one is non-zero and the
            # division is safe.
            degree = degree + (1 - abs(count1 - count2) / max(count1, count2))
        else:
            degree = degree + 1
    return degree / len(hist1)
|
||
|
||
|
||
def get_img_similarity(image1, image2, size=(256, 256)):
    """Return the mean per-channel histogram similarity of two images.

    Both images are resized to ``size`` and split into channels; each pair
    of corresponding channels is scored with ``calculate`` and the scores
    are averaged.

    Args:
        image1: First image, in a form accepted by ``cv2.resize``.
        image2: Second image, in a form accepted by ``cv2.resize``.
        size: Target size handed to ``cv2.resize`` for both images.

    Returns:
        The average of the per-channel ``calculate`` scores.

    Raises:
        Exception: Any error raised while resizing, splitting or scoring is
            re-raised after the sizes involved have been printed.
    """
    # Pre-bind so the diagnostics below never hit an unbound name when the
    # failure happens inside cv2.resize itself.
    image1_resized = image2_resized = None
    try:
        image1_resized = cv2.resize(image1, size)
        image2_resized = cv2.resize(image2, size)
        channel_pairs = list(zip(cv2.split(image1_resized),
                                 cv2.split(image2_resized)))
        sub_data = 0
        for im1, im2 in channel_pairs:
            sub_data += calculate(im1, im2)
        # Average over the channels actually compared instead of a fixed 3,
        # so grayscale or 4-channel inputs are scored on the same scale.
        sub_data = sub_data / len(channel_pairs)
    except Exception:
        # Print what we know for debugging, then let the error propagate.
        # Calling exit() here used to end the whole process with status 0.
        print(size)
        print(getattr(image1, 'shape', None))
        print(getattr(image2, 'shape', None))
        print(getattr(image1_resized, 'shape', None))
        print(getattr(image2_resized, 'shape', None))
        raise
    return sub_data
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,192 @@ | ||
import pymongo | ||
import pymysql | ||
import progressbar | ||
import math | ||
import multiprocessing | ||
import os | ||
import json | ||
import numpy as np | ||
import cv2 | ||
import pandas as pd | ||
from collections import defaultdict, OrderedDict | ||
from strsimpy.jaro_winkler import JaroWinkler | ||
|
||
# Shared Jaro-Winkler scorer; check_identity_jksim calls its similarity() method.
jarowinkler = JaroWinkler()
# Minimum Jaro-Winkler similarity for two names to be treated as a match.
name_sim_threshold = 0.9 # this threshold is selected based on manual evaluation
|
||
def parse_name(string):
    """Normalise a name: lower-case it and strip '-', '_' and ' ' characters."""
    drop_separators = str.maketrans('', '', '-_ ')
    lowered = string.lower()
    return lowered.translate(drop_separators)
|
||
|
||
''' | ||
Required input: | ||
A mysql GHTorrent dump with the username and password to access it | ||
A mongo collection which stores the twitter user information (crawled via the Twitter API); these users are the candidates for gh-tw account linking | ||
''' | ||
|
||
''' | ||
The script will insert the identified tw-gh link to another mongo collection | ||
''' | ||
|
||
|
||
# access to mysql database
MYSQL_USER = ""
MYSQL_PASSWORD = ""
MYSQL_DB_NAME = ""
# access to mongo database
MONGO_USER = ""
MONGO_PASSWORD = ""
MONGO_DB_NAME = ""

# Mongo collection the twitter candidates are read from, and the collection
# the identified tw-gh links are inserted into.
mongo_collection_name_twitter_candidate = ''
mongo_collection_linkage_result = ''

# Keyword arguments are required here: PyMySQL 1.0+ rejects positional
# connect() arguments.  The connection gets its own name so it is no longer
# shadowed by the mongo handle bound to ``db`` below.
mysql_connection = pymysql.connect(host='localhost', user=MYSQL_USER,
                                   password=MYSQL_PASSWORD,
                                   database=MYSQL_DB_NAME)
cursor = mysql_connection.cursor()

client = pymongo.MongoClient(host='localhost', username=MONGO_USER,
                             password=MONGO_PASSWORD,
                             authSource=MONGO_DB_NAME, port=27017)
db = client.twitter
|
||
# Lookup tables for the twitter candidates, all keyed by the user's id string:
# normalised display name, normalised screen name and untouched screen name.
user_id_str2parsed_dname = {}
user_id_str2parsed_sname = {}
user_id_str2original_sname = {}

for user in db[mongo_collection_name_twitter_candidate].find():
    user_id_str = str(user['id_str'])
    parsed_dname, parsed_sname = (parse_name(user['name']),
                                  parse_name(user['screen_name']))

    user_id_str2parsed_dname[user_id_str] = parsed_dname
    user_id_str2parsed_sname[user_id_str] = parsed_sname
    user_id_str2original_sname[user_id_str] = user['screen_name']

print("size of twitter user", len(user_id_str2parsed_dname))
|
||
|
||
|
||
# Logins to leave out of the candidate set.  This name was read by the loop
# below without ever being assigned, which raised NameError on the first row.
# NOTE(review): nothing in this script produces such a list, so it starts
# empty; fill it with already-linked logins if an earlier pass exists.
identified_login_set = set()

cursor.execute('select login, name from users_private where name is not null')
valid_login2parsed_dname = {}
valid_login2parsed_login = {}

for login, name in cursor.fetchall():
    if login in identified_login_set:
        continue
    if len(login) == 8 and login.isupper():
        # fake user
        continue

    valid_login2parsed_dname[login] = parse_name(name)
    valid_login2parsed_login[login] = parse_name(login)

valid_login_list = list(valid_login2parsed_dname.keys())
print('size of github user', len(valid_login2parsed_dname))
|
||
valid_tw_id_str_list = list(user_id_str2parsed_dname.keys())

# Cut the twitter ids into ``total_process_count`` consecutive batches of at
# most ``batch_size`` ids each; trailing batches may be shorter or empty.
data_size = len(valid_tw_id_str_list)
total_process_count = 12
batch_size = int(math.ceil(data_size * 1.0 / total_process_count))
split_data = [
    valid_tw_id_str_list[batch_size * batch_index:batch_size * (batch_index + 1)]
    for batch_index in range(total_process_count)
]

# Each worker receives [batch index, ids of that batch].
data_input = [[batch_index, batch] for batch_index, batch in enumerate(split_data)]
|
||
def check_identity_eqal(name_group1_1, name_group1_2,
                        name_group2_1, name_group2_2):
    """Return True only when the two name groups match exactly.

    The rule is the conjunction of six conditions: every name of either
    group must equal some name of the other group, at least one same-position
    pair must be equal, and at least one cross-position pair must be equal.
    Taken together these conditions hold exactly when all four names are
    equal, which is what is tested here.

    Args:
        name_group1_1: First name of group 1.
        name_group1_2: Second name of group 1.
        name_group2_1: First name of group 2.
        name_group2_2: Second name of group 2.

    Returns:
        True if all four names are equal, False otherwise.
    """
    return name_group1_1 == name_group1_2 == name_group2_1 == name_group2_2
|
||
|
||
|
||
def check_identity_jksim(name_group1_1, name_group1_2,
                         name_group2_1, name_group2_2):
    """Fuzzy identity check based on Jaro-Winkler similarity.

    Each name of group 1 must reach ``name_sim_threshold`` similarity with
    at least one of the two names of group 2.

    Args:
        name_group1_1: First name of group 1.
        name_group1_2: Second name of group 1.
        name_group2_1: First name of group 2.
        name_group2_2: Second name of group 2.

    Returns:
        True if both names of group 1 have a sufficiently similar name in
        group 2, False otherwise.
    """
    def resembles_group2(name):
        # The first name of group 2 is tried first; the second is scored
        # only when the first falls below the threshold.
        if jarowinkler.similarity(name, name_group2_1) >= name_sim_threshold:
            return True
        return jarowinkler.similarity(name, name_group2_2) >= name_sim_threshold

    if not resembles_group2(name_group1_1):
        return False
    return resembles_group2(name_group1_2)
|
||
def get_linked_user(data_input):
    """Link one batch of twitter users to GitHub logins (pool worker).

    Every twitter id in the batch is compared against every GitHub login in
    ``valid_login2parsed_dname`` with ``check_identity_eqal``; each match is
    inserted as one document into the linkage-result mongo collection.

    Args:
        data_input: ``[process_index, tw_id_str_list]``.  Only the worker
            with ``process_index == 0`` displays a progress bar.

    Returns:
        None.  The results are written to mongo.
    """
    process_index = data_input[0]
    tw_id_str_list = data_input[1]

    # Each worker opens its own client and closes it when the batch is done
    # (it was previously never closed).
    client = pymongo.MongoClient(host='localhost', username=MONGO_USER,
                                 password=MONGO_PASSWORD,
                                 authSource=MONGO_DB_NAME, port=27017)
    try:
        linkage_collection = client.twitter[mongo_collection_linkage_result]

        tw_ids = tw_id_str_list
        if process_index == 0:
            p = progressbar.ProgressBar()
            p.start()
            tw_ids = p(tw_ids)

        for tw_id_str in tw_ids:
            tw_dname = user_id_str2parsed_dname[tw_id_str]
            tw_sname = user_id_str2parsed_sname[tw_id_str]

            for login, gh_dname in valid_login2parsed_dname.items():
                if check_identity_eqal(tw_dname, tw_sname,
                                       gh_dname, valid_login2parsed_login[login]):
                    linkage_collection.insert_one({
                        'tweet_user_id_str': tw_id_str,
                        'login': login,
                        'screen_name': user_id_str2original_sname[tw_id_str],
                    })
    finally:
        client.close()

    return None
# Only the process started as the script fans the work out.  Worker
# processes created with the "spawn" start method re-import this module and
# must not build pools of their own.  The ``with`` block shuts the pool down
# once every batch has been processed.
if __name__ == '__main__':
    with multiprocessing.Pool(total_process_count) as pool:
        results = pool.map_async(get_linked_user, data_input).get()
Oops, something went wrong.