Coverage for .tox/p311/lib/python3.11/site-packages/scicom/utilities/statistics.py: 80%

122 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2024-05-15 15:09 +0200

1"""Prune a network.""" 

2import igraph as ig 

3import numpy as np 

4import pandas as pd 

5 

6 

class PruneNetwork:
    """Create statistics for communication networks by deletion.

    For a given dataset with sender and receiver information,
    create a weighted network with igraph. For a given number
    of iterations, deletion amounts, and deletion types, the
    algorithm then generates network statistics for randomly
    sampled subnetworks.
    """

    def __init__(self, dataframe: pd.DataFrame) -> None:
        """Initialize pruning.

        Args:
            dataframe: Communication events. Expected to contain the
                columns "sender", "receiver", "sender_location",
                "receiver_location" and "step".
        """
        self.inputDF = dataframe

    def makeNet(self, dataframe: pd.DataFrame) -> ig.Graph:
        """Create network from dataframe.

        Assumes the existence of sender, receiver and step
        column names. Each (sender, receiver) pair becomes one directed
        edge carrying the list of active steps ("step") and the contact
        count ("weight"). Every node is annotated with a "location"
        attribute: the location of the agent's most recent activity in
        the full input data (ties between sending and receiving resolve
        to the receiver location).
        """
        networkdata = (
            dataframe.groupby(["sender", "receiver"])
            .agg({"step": lambda x: x.to_list()})
            .reset_index()
        )
        networkdata.insert(3, "weight", networkdata.step.apply(len))
        graph = ig.Graph.TupleList(
            networkdata.itertuples(index=False),
            directed=True,
            edge_attrs=["step", "weight"],
        )
        for node in graph.vs:
            agent = node["name"]
            edgSend = self.inputDF.query("sender == @agent")
            maxSend = edgSend.step.max()  # NaN when the agent never sent
            edgRec = self.inputDF.query("receiver == @agent")
            maxRec = edgRec.step.max()  # NaN when the agent never received
            if maxSend > maxRec or np.isnan(maxRec):
                # Most recent activity was sending: use the sender location.
                lastLoc = edgSend.query("step == @maxSend")["sender_location"].iloc[0]
            elif maxSend < maxRec or maxSend == maxRec or np.isnan(maxSend):
                # Most recent activity was receiving (or a tie): receiver location.
                lastLoc = edgRec.query("step == @maxRec")["receiver_location"].iloc[0]
            else:
                # Defensive: every node in the graph stems from at least one event.
                text = f"No location for agent {agent}, got max send {maxSend} and max rec {maxRec}."
                raise ValueError(text)
            node["location"] = lastLoc
        return graph

    def setSurvivalProb(self, graph: ig.Graph, *, method: str = "agents", ranked: bool = True) -> pd.DataFrame:
        """Generate probabilities for different survival modes.

        Orders the entities of the chosen mode (agents by in-degree,
        regions by number of distinct agents, time steps by step value)
        either by rank (descending) or randomly, then attaches one
        descending-sorted column of random survival weights per
        distribution, so the top-ordered entity receives the largest
        weight.

        Raises:
            ValueError: If method is not "agents", "regions" or "time".
        """
        if method == "agents":
            tempData = pd.DataFrame(
                {"id": graph.vs["name"], "degree": graph.indegree()},
            )
            tempData = tempData.sort_values("degree", ascending=False) if ranked else tempData.sample(frac=1)
        elif method == "regions":
            tempData = pd.DataFrame(
                pd.concat(
                    [self.inputDF.sender_location, self.inputDF.receiver_location],
                ).unique(), columns=["location"],
            )
            locations = pd.DataFrame({"id": graph.vs["name"], "location": graph.vs["location"]})
            locations = locations.groupby("location")["id"].nunique().reset_index(name="count")
            tempData = tempData.merge(locations, how="left").fillna(0)
            tempData = tempData.sort_values("count", ascending=False) if ranked else tempData.sample(frac=1)
        elif method == "time":
            tempData = pd.DataFrame({"step": range(self.inputDF.step.max() + 1)})
            tempData = tempData.sort_values("step", ascending=False) if ranked else tempData.sample(frac=1)
        else:
            text = f"Unknown survival method: {method}."
            raise ValueError(text)
        # BUGFIX: switch to a positional index before concatenating.
        # sort_values/sample keep the original index labels, and
        # pd.concat(axis=1) aligns on labels, which would silently undo
        # the ranking/shuffle when attaching the probability columns.
        tempData = tempData.reset_index(drop=True)
        rng = np.random.default_rng()
        size = len(tempData)
        probabilities = pd.DataFrame(
            {
                "unif": -np.sort(-rng.uniform(0, 1, size)),
                "log_normal1": -np.sort(-rng.lognormal(0, 1 / 2, size)),
                "log_normal2": -np.sort(-rng.lognormal(0, 1, size)),
                "log_normal3": -np.sort(-rng.lognormal(0, 2, size)),
                "exp": -np.sort(-rng.exponential(10, size)),
                "beta": -np.sort(-rng.beta(a=4, b=5, size=size)),
            },
        )
        return pd.concat([tempData, probabilities], axis=1)

    def scaleSurvivalProb(self, probabilities: pd.DataFrame, *, method: str = "agents") -> pd.DataFrame:
        """Scale survival for methods agents and regions.

        Maps per-entity survival weights onto the (sender, receiver)
        pairs of the input data: the pair weight is the product of both
        endpoint weights, normalized by the dot product over all pairs.
        For method "time" the probabilities are returned unchanged.

        Raises:
            ValueError: If method is not "agents", "regions" or "time".
        """
        colsType = ["unif", "beta", "exp", "log_normal1", "log_normal2", "log_normal3"]
        if method == "time":
            return probabilities
        if method == "agents":
            keyCols = ["sender", "receiver"]
            joinCol = "id"
        elif method == "regions":
            keyCols = ["sender_location", "receiver_location"]
            joinCol = "location"
        else:
            text = f"Unknown survival method: {method}."
            raise ValueError(text)
        tempData = (
            self.inputDF[keyCols]
            .drop_duplicates()
            .merge(probabilities, left_on=keyCols[0], right_on=joinCol)
            .merge(probabilities, left_on=keyCols[1], right_on=joinCol)
        )
        for col in colsType:
            # Joint pair weight from the _x (sender side) and _y (receiver
            # side) merge suffixes, normalized over all pairs.
            tempData[col] = tempData[col + "_x"] * tempData[col + "_y"] / np.dot(
                tempData[col + "_x"], tempData[col + "_y"],
            )
        return tempData[keyCols + colsType]

    def basicNetStats(self, graph: ig.Graph) -> pd.DataFrame:
        """Generate base statistics of network.

        Returns one row per node with its degree, degree rank and
        several centrality measures.
        """
        # Find the degree centrality.
        tempData = pd.DataFrame({"Degree": graph.degree()})

        # Find the ranking (rank 1 = highest degree; ties share the minimum rank).
        tempData["Rank"] = tempData["Degree"].rank(method="min", ascending=False)

        # Adding other types of centrality.
        tempData["Betweenness"] = graph.betweenness()
        tempData["Closeness"] = graph.closeness()
        tempData["Eigenvector"] = graph.eigenvector_centrality()
        tempData["Page_Rank"] = graph.pagerank()

        return tempData

    def netStats(self, G: ig.Graph) -> pd.DataFrame:
        """Generate network statistics.

        Returns a one-row dataframe of global measures for G.
        """
        # Number of components:
        no_components = len(G.components())
        # Number of maximal cliques / size of the largest clique:
        # TODO(Malte): Consider if these are necessary. Performance!
        # no_cliques = len(G.maximal_cliques())
        # size_clique = G.omega()
        # Average path length:
        avg_path = G.average_path_length()
        # Diameter:
        diameter = G.diameter()
        # Modularity:
        modularity = G.modularity(G.components())
        # Transitivity:
        transitivity = G.transitivity_undirected()
        # Cohesion:
        cohesion = G.cohesion()
        # Degree assortativity:
        assortativity = G.assortativity_degree()
        # In-degree centrality for each node:
        indegrees = G.indegree()
        # Average relative degree:
        N = len(G.vs)
        avg_rel_degree = np.mean([x / N for x in indegrees])
        # Tail estimator (Hill) of the in-degree distribution:
        hill = ig.statistics.power_law_fit(
            indegrees,
            xmin=None,
            method="hill",
        ).alpha
        # Freeman-style in-degree centralization:
        max_indegree = max(indegrees)
        centralization = float(N * max_indegree - sum(indegrees)) / (N - 1) ** 2

        return pd.DataFrame([{
            "no_components": no_components,
            # "no_cliques": no_cliques,
            # "size_clique": size_clique,
            "diameter": diameter,
            "avg_path": avg_path,
            "modularity": modularity,
            "transitivity": transitivity,
            "cohesion": cohesion,
            "assortativity": assortativity,
            "avg_degree": avg_rel_degree,
            "centralization": centralization,
            "hill": hill,
        }])

    def generatePruningParameters(self, G: ig.Graph) -> pd.DataFrame:
        """Generate a random set of pruning weights.

        Draws one random weight per node from several distributions and
        combines them per edge as the normalized product of the endpoint
        weights, returning one row per edge.
        """
        nodes = G.get_vertex_dataframe()
        id2name = nodes.to_dict()["name"]
        rng = np.random.default_rng()
        n_nodes = len(G.vs)
        del_parameter = pd.DataFrame(
            {
                "ids": nodes.index,
                "degree": G.degree(),
                "unif": rng.uniform(0, 1, n_nodes),
                "log_normal": rng.lognormal(0, 1, n_nodes),
                "exp": rng.exponential(1, n_nodes),
                "beta": rng.beta(a=2, b=3, size=n_nodes),
            },
        )

        # Attach the node draws to both endpoints of every edge
        # (_x = source side, _y = target side).
        del_parameter = G.get_edge_dataframe()[["source", "target"]].merge(
            del_parameter, left_on="source", right_on="ids",
        ).merge(
            del_parameter, left_on="target", right_on="ids",
        )
        for col in ["degree", "unif", "log_normal", "exp", "beta"]:
            # Joint edge weight: product of endpoint draws, normalized.
            del_parameter[col] = del_parameter[col + "_x"] * del_parameter[col + "_y"] / np.dot(
                del_parameter[col + "_x"], del_parameter[col + "_y"],
            )
        # Translate igraph vertex ids back to agent names.
        del_parameter.insert(0, "sender", del_parameter["source"].map(id2name))
        del_parameter.insert(0, "receiver", del_parameter["target"].map(id2name))
        return del_parameter[
            ["sender", "receiver", "degree", "unif", "log_normal", "exp", "beta"]
        ]

    def deleteFromNetwork(
        self,
        iterations: int = 10,
        delAmounts: tuple = (0.1, 0.25, 0.5, 0.75, 0.9),
        delTypes: tuple = ("unif", "log_normal1", "exp", "beta", "log_normal2", "log_normal3"),
        delMethod: tuple = ("agents", "regions", "time"),
        rankedVals: tuple = (True, False),
    ) -> pd.DataFrame:
        """Run the deletion by sampling.

        Args:
            iterations: Number of independent repetitions.
            delAmounts: Fractions of the data to delete.
            delTypes: Probability distributions used as sampling weights.
            delMethod: Survival modes ("agents", "regions", "time").
            rankedVals: Whether survival follows rank or is randomized.

        Returns:
            Statistics of the full network (delIteration == 0) plus one
            row per pruned subnetwork, concatenated.
        """
        results = []
        fullNet = self.makeNet(self.inputDF)
        fullStats = self.netStats(fullNet).assign(
            delVal=0, delType="NA", delIteration=0, delMethod="NA", rankedVal="NA",
        )
        results.append(fullStats)
        for idx in range(1, iterations + 1):
            for method in delMethod:
                for ranked in rankedVals:
                    probVals = self.setSurvivalProb(
                        fullNet, method=method, ranked=ranked,
                    )
                    prunVals = self.scaleSurvivalProb(
                        probVals, method=method,
                    )
                    # Attach the survival weights to every event row.
                    tempDF = self.inputDF.merge(prunVals)
                    for val in delAmounts:
                        for deltype in delTypes:
                            # Keep a (1 - val) fraction, sampled by weight.
                            delDF = tempDF.sample(
                                frac=1 - val,
                                weights=deltype,
                            )
                            delStats = self.netStats(self.makeNet(delDF)).assign(
                                delVal=val, delType=deltype, delIteration=idx,
                                delMethod=method, rankedVal=ranked,
                            )
                            results.append(delStats)
        return pd.concat(results)

260 

261 

262 

def prune(
    modelparameters: dict,
    network: tuple,
    columns: list,
    iterations: int = 10,
    delAmounts: tuple = (0.1, 0.25, 0.5, 0.75, 0.9),
    delTypes: tuple = ("unif", "log_normal1", "exp", "beta", "log_normal2", "log_normal3"),
    delMethod: tuple = ("agents", "regions", "time"),
    rankedVals: tuple = (True, False)) -> pd.DataFrame:
    """Generate pruned networks from input.

    Assumes existence of columns "sender", "receiver",
    "sender_location", "receiver_location" and "step".
    Builds a dataframe from the raw network tuples, runs the
    pruning experiment, and tags every result row with the
    supplied model parameters.
    """
    eventData = pd.DataFrame(network, columns=columns)
    statistics = PruneNetwork(eventData).deleteFromNetwork(
        iterations=iterations,
        delAmounts=delAmounts,
        delTypes=delTypes,
        delMethod=delMethod,
        rankedVals=rankedVals,
    )
    return statistics.assign(**modelparameters)