Coverage for .tox/p311/lib/python3.11/site-packages/scicom/utilities/statistics.py: 80%
122 statements
« prev ^ index » next coverage.py v7.4.4, created at 2024-05-15 15:09 +0200
1"""Prune a network."""
2import igraph as ig
3import numpy as np
4import pandas as pd
class PruneNetwork:
    """Create statistics for communication networks by deletion.

    For a given dataset with sender and receiver information,
    create a weighted network with igraph. For a given number
    of iterations, deletion amounts, and deletion types, the
    algorithm then generates network statistics for randomly
    sampled subnetworks.
    """

    def __init__(self, dataframe: pd.DataFrame) -> None:
        """Initialize pruning.

        Args:
            dataframe: Communication events; must contain the columns
                "sender", "receiver", "sender_location",
                "receiver_location" and "step".
        """
        self.inputDF = dataframe

    def makeNet(self, dataframe: pd.DataFrame) -> ig.Graph:
        """Create network from dataframe.

        Assumes the existence of sender, receiver and step
        column names. Edge weight is the number of contacts per
        sender/receiver pair; each node gets a "location"
        attribute taken from its most recent activity.
        """
        networkdata = dataframe.groupby(["sender", "receiver"]).agg({"step": lambda x: x.to_list()}).reset_index()
        counts = networkdata.step.apply(len)
        networkdata.insert(3, "weight", counts)
        graph = ig.Graph.TupleList(
            networkdata.itertuples(index=False), directed=True, edge_attrs=["step", "weight"],
        )
        for node in graph.vs:
            agent = node["name"]
            edgSend = self.inputDF.query("sender == @agent")
            maxSend = edgSend.step.max()
            edgRec = self.inputDF.query("receiver == @agent")
            maxRec = edgRec.step.max()
            # NaN comparisons are always False, so the isnan() checks route
            # agents that only ever sent (or only ever received) correctly.
            if maxSend > maxRec or np.isnan(maxRec):
                lastLoc = edgSend.query("step == @maxSend")["sender_location"].iloc[0]
            elif maxSend <= maxRec or np.isnan(maxSend):
                lastLoc = edgRec.query("step == @maxRec")["receiver_location"].iloc[0]
            else:
                # Unreachable for nodes present in the graph (each has at
                # least one send or receive event); kept as a safeguard.
                text = f"No location for agent {agent}, got max send {maxSend} and max rec {maxRec}."
                raise ValueError(text)
            node["location"] = lastLoc
        return graph

    def setSurvivalProb(self, graph: ig.Graph, *, method: str = "agents", ranked: bool = True) -> pd.DataFrame:
        """Generate probabilities for different survival modes.

        Args:
            graph: Full network as produced by makeNet.
            method: One of "agents" (per-node in-degree), "regions"
                (per-location node counts) or "time" (per-step).
            ranked: If True, order units by importance so that the
                descending-sorted probability draws favor them;
                otherwise shuffle uniformly at random.

        Returns:
            The ordered units concatenated with six columns of
            descending-sorted random survival probabilities.

        Raises:
            ValueError: If method is not one of the three modes.
        """
        if method == "agents":
            tempData = pd.DataFrame(
                {"id": graph.vs["name"], "degree": graph.indegree()},
            )
            tempData = tempData.sort_values("degree", ascending=False) if ranked else tempData.sample(frac=1)
        elif method == "regions":
            tempData = pd.DataFrame(
                pd.concat(
                    [self.inputDF.sender_location, self.inputDF.receiver_location],
                ).unique(), columns=["location"],
            )
            locations = pd.DataFrame({"id": graph.vs["name"], "location": graph.vs["location"]})
            locations = locations.groupby("location")["id"].nunique().reset_index(name="count")
            tempData = tempData.merge(locations, how="left").fillna(0)
            tempData = tempData.sort_values("count", ascending=False) if ranked else tempData.sample(frac=1)
        elif method == "time":
            tempData = pd.DataFrame({"step": range(self.inputDF.step.max() + 1)})
            tempData = tempData.sort_values("step", ascending=False) if ranked else tempData.sample(frac=1)
        else:
            text = f"Unknown survival method {method}, expected 'agents', 'regions' or 'time'."
            raise ValueError(text)
        # Reset the index after sorting/shuffling: pd.concat(axis=1) aligns
        # on the index, so without this the sorted probability columns would
        # be re-matched to the rows' original order, silently ignoring both
        # the ranking and the shuffle.
        tempData = tempData.reset_index(drop=True)
        rng = np.random.default_rng()
        probabilities = pd.DataFrame(
            {
                "unif": -np.sort(-rng.uniform(0, 1, len(tempData))),
                "log_normal1": -np.sort(-rng.lognormal(0, 1 / 2, len(tempData))),
                "log_normal2": -np.sort(-rng.lognormal(0, 1, len(tempData))),
                "log_normal3": -np.sort(-rng.lognormal(0, 2, len(tempData))),
                "exp": -np.sort(-rng.exponential(10, len(tempData))),
                "beta": -np.sort(-rng.beta(a=4, b=5, size=len(tempData))),
            },
        )
        return pd.concat([tempData, probabilities], axis=1)

    def scaleSurvivalProb(self, probabilities: pd.DataFrame, *, method: str = "agents") -> pd.DataFrame:
        """Scale survival for methods agents and regions.

        Joins the per-unit probabilities onto each edge and combines
        sender-side and receiver-side draws into one normalized
        per-edge weight. The "time" method needs no scaling and is
        returned unchanged.

        Raises:
            ValueError: If method is not one of the three modes.
        """
        colsType = ["unif", "beta", "exp", "log_normal1", "log_normal2", "log_normal3"]
        if method == "time":
            return probabilities
        if method == "agents":
            cols = ["sender", "receiver"]
            cols.extend(colsType)
            tempData = self.inputDF[["sender", "receiver"]].drop_duplicates().merge(
                probabilities, left_on="sender", right_on="id",
            )
            tempData = tempData.merge(probabilities, left_on="receiver", right_on="id")
        elif method == "regions":
            cols = ["sender_location", "receiver_location"]
            cols.extend(colsType)
            tempData = self.inputDF[["sender_location", "receiver_location"]].drop_duplicates().merge(
                probabilities, left_on="sender_location", right_on="location",
            )
            tempData = tempData.merge(probabilities, left_on="receiver_location", right_on="location")
        else:
            text = f"Unknown survival method {method}, expected 'agents', 'regions' or 'time'."
            raise ValueError(text)
        for i in colsType:
            # Product of both endpoint draws, normalized by the dot product
            # so the column can serve directly as sampling weights.
            tempData[i] = tempData[i + "_x"] * tempData[i + "_y"] / np.dot(tempData[i + "_x"], tempData[i + "_y"])
        return tempData[cols]

    def basicNetStats(self, graph: ig.Graph) -> pd.DataFrame:
        """Generate base statistics of network.

        Returns one row per node with degree, degree rank and four
        further centrality measures.
        """
        # Find the degree centrality
        tempData = pd.DataFrame({"Degree": graph.degree()})
        # Find the ranking
        tempData["Rank"] = tempData["Degree"].rank(method="min", ascending=False)
        # Adding other types of centrality
        tempData["Betweenness"] = graph.betweenness()
        tempData["Closeness"] = graph.closeness()
        tempData["Eigenvector"] = graph.eigenvector_centrality()
        tempData["Page_Rank"] = graph.pagerank()
        return tempData

    def netStats(self, G: ig.Graph) -> pd.DataFrame:
        """Generate network statistics.

        Returns a single-row dataframe of whole-graph measures.
        """
        # Number of components:
        no_components = len(G.components())
        # Number of maximal cliques:
        # TODO(Malte): Consider if these are necessary. Performance!
        # no_cliques = len(G.maximal_cliques())
        # Size of the largest clique:
        # size_clique = G.omega()
        # Average path length:
        avg_path = G.average_path_length()
        # Diameter:
        diameter = G.diameter()
        # Modularity:
        modularity = G.modularity(G.components())
        # Transitivity:
        transitivity = G.transitivity_undirected()
        # Cohesion
        cohesion = G.cohesion()
        # Degree assortativity:
        assortativity = G.assortativity_degree()
        # Find the in-degree centrality for each node:
        indegrees = G.indegree()
        # Average relative degree:
        N = len(G.vs)
        avg_rel_degree = np.mean([x / N for x in indegrees])
        # Tail estimator (Hill):
        hill = ig.statistics.power_law_fit(
            indegrees,
            xmin=None,
            method="hill",
        ).alpha
        # Centralization (Freeman-style, normalized by (N-1)^2):
        max_indegree = max(indegrees)
        centralization = float(N * max_indegree - sum(indegrees)) / (N - 1) ** 2
        return pd.DataFrame([{
            "no_components": no_components,
            # "no_cliques":no_cliques,
            # "size_clique":size_clique,
            "diameter": diameter,
            "avg_path": avg_path,
            "modularity": modularity,
            "transitivity": transitivity,
            "cohesion": cohesion,
            "assortativity": assortativity,
            "avg_degree": avg_rel_degree,
            "centralization": centralization,
            "hill": hill,
        }])

    def generatePruningParameters(self, G: ig.Graph) -> pd.DataFrame:
        """Generate a random set of pruning weights.

        Draws per-node random values from several distributions, joins
        them onto the edge list and combines the two endpoint draws
        into one normalized per-edge weight per distribution.
        """
        nodes = G.get_vertex_dataframe()
        id2name = nodes.to_dict()["name"]
        rng = np.random.default_rng()
        del_parameter = pd.DataFrame(
            {
                "ids": nodes.index,
                "degree": G.degree(),
                "unif": rng.uniform(0, 1, len(G.vs)),
                "log_normal": rng.lognormal(0, 1, len(G.vs)),
                "exp": rng.exponential(1, len(G.vs)),
                "beta": rng.beta(a=2, b=3, size=len(G.vs)),
            },
        )
        del_parameter = G.get_edge_dataframe()[["source", "target"]].merge(
            del_parameter, left_on="source", right_on="ids",
        ).merge(
            del_parameter, left_on="target", right_on="ids",
        )
        for col in ("degree", "unif", "log_normal", "exp", "beta"):
            # Product of source (_x) and target (_y) draws, normalized by
            # their dot product so each column sums to a sampling weight.
            del_parameter[col] = del_parameter[col + "_x"] * del_parameter[col + "_y"] / np.dot(
                del_parameter[col + "_x"], del_parameter[col + "_y"],
            )
        sender = del_parameter["source"].apply(lambda x: id2name[x])
        receiver = del_parameter["target"].apply(lambda x: id2name[x])
        del_parameter.insert(0, "sender", sender)
        del_parameter.insert(0, "receiver", receiver)
        return del_parameter[
            ["sender", "receiver", "degree", "unif", "log_normal", "exp", "beta"]
        ]

    def deleteFromNetwork(
        self,
        iterations: int = 10,
        delAmounts: tuple = (0.1, 0.25, 0.5, 0.75, 0.9),
        delTypes: tuple = ("unif", "log_normal1", "exp", "beta", "log_normal2", "log_normal3"),
        delMethod: tuple = ("agents", "regions", "time"),
        rankedVals: tuple = (True, False),
    ) -> pd.DataFrame:
        """Run the deletion by sampling.

        For every iteration, survival method, ranking mode, deletion
        amount and deletion distribution, sample a surviving subset of
        events, rebuild the network and record its statistics. The row
        for the full network carries delVal=0 and "NA" markers.
        """
        results = []
        fullNet = self.makeNet(
            self.inputDF,
        )
        fullStats = self.netStats(fullNet)
        fullStats = fullStats.assign(
            delVal=0, delType="NA", delIteration=0, delMethod="NA", rankedVal="NA",
        )
        results.append(fullStats)
        for idx in range(1, iterations + 1):
            for method in delMethod:
                for ranked in rankedVals:
                    probVals = self.setSurvivalProb(
                        fullNet, method=method, ranked=ranked,
                    )
                    prunVals = self.scaleSurvivalProb(
                        probVals, method=method,
                    )
                    tempDF = self.inputDF.merge(
                        prunVals,
                    )
                    for val in delAmounts:
                        for deltype in delTypes:
                            # Keep (1 - val) of the events, weighted by the
                            # chosen survival-probability column.
                            delDF = tempDF.sample(
                                frac=(1 - val),
                                weights=deltype,
                            )
                            delNet = self.makeNet(delDF)
                            delStats = self.netStats(delNet)
                            delStats = delStats.assign(
                                delVal=val, delType=deltype, delIteration=idx, delMethod=method, rankedVal=ranked,
                            )
                            results.append(delStats)
        return pd.concat(results)
def prune(
    modelparameters: dict,
    network: tuple,
    columns: list,
    iterations: int = 10,
    delAmounts: tuple = (0.1, 0.25, 0.5, 0.75, 0.9),
    delTypes: tuple = ("unif", "log_normal1", "exp", "beta", "log_normal2", "log_normal3"),
    delMethod: tuple = ("agents", "regions", "time"),
    rankedVals: tuple = (True, False)) -> pd.DataFrame:
    """Generate pruned networks from input.

    Assumes existence of columns "sender", "receiver",
    "sender_location", "receiver_location" and "step".
    """
    # Build the event dataframe and run the full deletion sweep on it.
    eventFrame = pd.DataFrame(network, columns=columns)
    statistics = PruneNetwork(eventFrame).deleteFromNetwork(
        iterations=iterations,
        delAmounts=delAmounts,
        delTypes=delTypes,
        delMethod=delMethod,
        rankedVals=rankedVals,
    )
    # Tag every result row with the model parameters for later grouping.
    return statistics.assign(**modelparameters)