import json
from itertools import combinations
from typing import Union, Iterable, Tuple
import pandas as pd
import networkx as nx
def cross_ref_to_index(df: pd.DataFrame, id_column: str, match_key_columns: Union[str, list[str]]) -> pd.MultiIndex:
match_lists = df.sort_values(id_column, ascending=False).groupby(match_key_columns)[id_column].apply(lambda s: list(s))
match_lists = match_lists.loc[match_lists.apply(lambda s: len(s)) > 1]
match_pairs = []
for match_list in match_lists:
match_pairs += list(combinations(match_list, 2))
return pd.MultiIndex.from_tuples(match_pairs)
def edges_to_cross_ref(edges: Iterable[Tuple], name_original_id: str, name_resolved_id: str) -> pd.DataFrame:
"""Applies transitive closure and returns resuts as a cross-reference table."""
graph = nx.from_edgelist(edges)
sub_graphs = [graph.subgraph(c) for c in nx.connected_components(graph)]
match_groups = []
for i, sub_graph in enumerate(sub_graphs):
match_groups.append(pd.DataFrame({name_original_id: sub_graph.nodes})
.assign(**{name_resolved_id: i}))
return pd.concat(match_groups, ignore_index=True)
# Read the settlements dataset provided in JSON line format:
data = []
with open('geo_settlements/settlements.json', 'r') as f:
for line in f:
data.append(json.loads(line))
settlements = pd.json_normalize(data).rename(columns={'data.lon': 'lon', 'data.lat': 'lat', 'data.label': 'name'})[['id', 'name', 'lon', 'lat']]
print('Example records: \n', settlements.head())
# Read the actual cluster assignments:
data = []
with open('geo_settlements/combinedSettlements(PerfectMatch).json', 'r') as f:
for line in f:
data.append(json.loads(line)['data'])
true_clusters = pd.json_normalize(data)['clusteredVertices']
true_clusters = pd.concat([pd.DataFrame({'id': cluster}).assign(cluster=i) for i, cluster in true_clusters.items()]).reset_index(drop=True)
# Transform actual matches into MultiIndex to meet recordlinkage standards:
true_matches = cross_ref_to_index(df=true_clusters, id_column='id', match_key_columns='cluster')
print('\nNumber of actual matches: ', true_matches.shape[0])
print('Distribution of actual cluster sizes: \n', true_clusters.cluster.value_counts().value_counts())