Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1#!/usr/bin/env python 

2 

3""" 

4camcops_server/cc_modules/cc_anon.py 

5 

6=============================================================================== 

7 

8 Copyright (C) 2012-2020 Rudolf Cardinal (rudolf@pobox.com). 

9 

10 This file is part of CamCOPS. 

11 

12 CamCOPS is free software: you can redistribute it and/or modify 

13 it under the terms of the GNU General Public License as published by 

14 the Free Software Foundation, either version 3 of the License, or 

15 (at your option) any later version. 

16 

17 CamCOPS is distributed in the hope that it will be useful, 

18 but WITHOUT ANY WARRANTY; without even the implied warranty of 

19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

20 GNU General Public License for more details. 

21 

22 You should have received a copy of the GNU General Public License 

23 along with CamCOPS. If not, see <https://www.gnu.org/licenses/>. 

24 

25=============================================================================== 

26 

27**Anonymisation functions.** 

28 

29Largely superseded by CRATE (https://doi.org/10.1186%2Fs12911-017-0437-1). 

30 

31""" 

32 

33from collections import OrderedDict 

34import csv 

35import sys 

36from typing import ( 

37 Dict, List, Generator, TextIO, Tuple, TYPE_CHECKING, Union, 

38) 

39 

40from cardinal_pythonlib.sqlalchemy.orm_inspect import coltype_as_typeengine 

41from cardinal_pythonlib.sqlalchemy.schema import ( 

42 convert_sqla_type_for_dialect, 

43 does_sqlatype_require_index_len, 

44 is_sqlatype_date, 

45 is_sqlatype_text_of_length_at_least, 

46 RE_COLTYPE_WITH_ONE_PARAM, 

47) 

48from cardinal_pythonlib.sqlalchemy.session import SQLITE_MEMORY_URL 

49# from sqlalchemy.dialects.mssql.base import MSDialect 

50from sqlalchemy.dialects.mysql.base import MySQLDialect 

51from sqlalchemy.engine import create_engine 

52from sqlalchemy.engine.interfaces import Dialect 

53from sqlalchemy.orm import Session as SqlASession, sessionmaker 

54from sqlalchemy.sql.schema import Column 

55 

56from camcops_server.cc_modules.cc_constants import TABLET_ID_FIELD 

57from camcops_server.cc_modules.cc_db import FN_PK 

58from camcops_server.cc_modules.cc_dump import DumpController 

59from camcops_server.cc_modules.cc_patient import Patient 

60from camcops_server.cc_modules.cc_patientidnum import ( 

61 extra_id_colname, 

62 EXTRA_IDNUM_FIELD_PREFIX, 

63) 

64from camcops_server.cc_modules.cc_simpleobjects import TaskExportOptions 

65from camcops_server.cc_modules.cc_sqla_coltypes import CamcopsColumn 

66 

67if TYPE_CHECKING: 

68 from camcops_server.cc_modules.cc_exportrecipientinfo import ExportRecipientInfo # noqa 

69 from camcops_server.cc_modules.cc_request import CamcopsRequest 

70 

71# ============================================================================= 

72# Constants 

73# ============================================================================= 

74 

75MIN_STRING_LENGTH_TO_CONSIDER_SCRUBBING = 256 

76 

77 

78# ============================================================================= 

79# Write data dictionaries for anonymisation tools 

80# ============================================================================= 

81 

82def _gen_columns_for_anon_staging_db(req: "CamcopsRequest", 

83 recipient: "ExportRecipientInfo") \ 

84 -> Generator[Union[Column, CamcopsColumn], None, None]: 

85 """ 

86 Generates all columns for an anonymisation staging database. 

87 """ 

88 url = SQLITE_MEMORY_URL 

89 engine = create_engine(url, echo=False) 

90 session = sessionmaker(bind=engine)() # type: SqlASession 

91 export_options = TaskExportOptions( 

92 include_blobs=recipient.db_include_blobs, 

93 db_patient_id_per_row=recipient.db_patient_id_per_row, 

94 db_make_all_tables_even_empty=True, 

95 db_include_summaries=recipient.db_add_summaries, 

96 ) 

97 

98 dc = DumpController(dst_engine=engine, 

99 dst_session=session, 

100 export_options=export_options, 

101 req=req) 

102 for col in dc.gen_all_dest_columns(): 

103 yield col 

104 

105 

106# ----------------------------------------------------------------------------- 

107# CRIS 

108# ----------------------------------------------------------------------------- 

109 

110def _get_type_size_as_text_from_sqltype(sqltype: str) -> Tuple[str, str]: 

111 """ 

112 Splits SQL size definitions like ``VARCHAR(10)`` into tuples like 

113 ``('VARCHAR', '10')`` If it doesn't fit that format, return 

114 ``(sqltype, '')``. 

115 """ 

116 m = RE_COLTYPE_WITH_ONE_PARAM.match(sqltype) 

117 if m is not None: 

118 finaltype = m.group('type').upper() 

119 size = m.group('size').strip().upper() 

120 else: 

121 size = "" 

122 finaltype = sqltype 

123 return finaltype, size 

124 

125 

126# noinspection PyUnusedLocal 

127def _get_cris_dd_row(column: Union[Column, CamcopsColumn, None], 

128 recipient: "ExportRecipientInfo", 

129 dest_dialect: Dialect = None) -> Dict: 

130 """ 

131 Args: 

132 column: 

133 A column specification (or ``None`` to create a dummy dictionary). 

134 dest_dialect: 

135 The SQL dialect of the destination database. If ``None``, then 

136 MySQL is used as the default. 

137 

138 Returns: 

139 An :class:`OrderedDict` with information for a CRIS data dictionary 

140 row. 

141 """ 

142 dest_dialect = dest_dialect or MySQLDialect() # MSDialect() for SQL Server 

143 valid_values = None 

144 if column is None: 

145 # Dummy row 

146 colname = None 

147 tablename = None 

148 taskname = None 

149 comment = None 

150 feft = None 

151 security_status = None 

152 finaltype = None 

153 tlfa = None 

154 size = None 

155 else: 

156 colname = column.name 

157 tablename = column.table.name 

158 taskname = tablename 

159 comment = column.comment 

160 coltype = coltype_as_typeengine(column.type) 

161 is_free_text = is_sqlatype_text_of_length_at_least( 

162 coltype, min_length=MIN_STRING_LENGTH_TO_CONSIDER_SCRUBBING) 

163 exempt_from_anonymisation = False 

164 identifies_patient = False 

165 

166 if isinstance(column, CamcopsColumn): 

167 exempt_from_anonymisation = column.exempt_from_anonymisation 

168 identifies_patient = column.identifies_patient 

169 if column.permitted_value_checker: 

170 valid_values = column.permitted_value_checker.permitted_values_csv() # noqa 

171 

172 needs_scrubbing = is_free_text and not exempt_from_anonymisation 

173 

174 # Tag list - fields anon 

175 tlfa = "Y" if needs_scrubbing else "" 

176 

177 # Destination SQL type 

178 desttype = convert_sqla_type_for_dialect( 

179 coltype=coltype, 

180 dialect=dest_dialect, 

181 strip_collation=True, 

182 expand_for_scrubbing=needs_scrubbing, 

183 ) 

184 destsqltype = desttype.compile(dialect=dest_dialect) 

185 finaltype, size = _get_type_size_as_text_from_sqltype(destsqltype) 

186 

187 # Security status 

188 system_id = ( 

189 colname == TABLET_ID_FIELD or 

190 colname.endswith("_id") 

191 ) 

192 patient_idnum_field = colname.startswith(EXTRA_IDNUM_FIELD_PREFIX) 

193 internal_field = colname.startswith("_") 

194 if identifies_patient and (tablename == Patient.__tablename__ and 

195 colname == Patient.dob.name): 

196 security_status = 3 # truncate (e.g. DOB, postcode) 

197 elif identifies_patient and tablename == Patient.__tablename__: 

198 security_status = 2 # use to scrub 

199 elif system_id or internal_field or identifies_patient: 

200 security_status = 1 # drop (e.g. for pointless internal keys) 

201 else: 

202 security_status = 4 # bring through 

203 

204 # Front end field type 

205 if system_id or patient_idnum_field: 

206 feft = 34 # patient ID; other internal keys 

207 elif is_sqlatype_date(coltype): 

208 feft = 4 # dates 

209 elif is_free_text: 

210 feft = 3 # giant free text, I think 

211 elif valid_values is not None: 

212 feft = 2 # picklist 

213 else: 

214 feft = 1 # text, numbers 

215 

216 return OrderedDict([ 

217 ("Tab", "CamCOPS"), 

218 ("Form name", taskname), 

219 ("CRIS tree label", colname), 

220 ("Source system table name", tablename), 

221 ("SQL column name", colname), 

222 ("Front end field type", feft), 

223 ("Valid values", valid_values), 

224 ("Result column name", colname), 

225 ("Family doc tab name", ""), 

226 ("Family doc form name", ""), 

227 ("Security status", security_status), 

228 ("Exclude", ""), 

229 ("End SQL Type", finaltype), 

230 ("Header field (Y/N)", ""), 

231 ("Header field name", ""), 

232 ("Header field active (Y/N)", ""), 

233 ("View name", ""), 

234 ("Exclude from family doc", ""), 

235 ("Tag list - fields anon", tlfa), 

236 ("Anon type", ""), # formerly "Additional info" 

237 ("Form start date", ""), 

238 ("Form end date", ""), 

239 ("Source", ""), 

240 ("Size", size), 

241 ("Header logic", ""), 

242 ("Patient/contact", ""), 

243 ("Comments", comment), 

244 ]) 

245 

246 

247def write_cris_data_dictionary(req: "CamcopsRequest", 

248 recipient: "ExportRecipientInfo", 

249 file: TextIO = sys.stdout) -> None: 

250 """ 

251 Generates a draft CRIS data dictionary. 

252 

253 CRIS is an anonymisation tool. See 

254 

255 - Stewart R, Soremekun M, Perera G, Broadbent M, Callard F, Denis M, Hotopf 

256 M, Thornicroft G, Lovestone S (2009). 

257 The South London and Maudsley NHS Foundation Trust Biomedical Research 

258 Centre (SLAM BRC) case register: development and descriptive data. 

259 *BMC Psychiatry* 9: 51. 

260 https://www.ncbi.nlm.nih.gov/pubmed/19674459 

261 

262 - Fernandes AC, Cloete D, Broadbent MT, Hayes RD, Chang CK, Jackson RG, 

263 Roberts A, Tsang J, Soncul M, Liebscher J, Stewart R, Callard F (2013). 

264 Development and evaluation of a de-identification procedure for a case 

265 register sourced from mental health electronic records. 

266 *BMC Med Inform Decis Mak.* 13: 71. 

267 https://www.ncbi.nlm.nih.gov/pubmed/23842533 

268 

269 Args: 

270 req: a :class:`camcops_server.cc_modules.cc_request.CamcopsRequest` 

271 recipient: a :class:`camcops_server.cc_modules.cc_exportrecipientinfo.ExportRecipientInfo` 

272 file: output file 

273 """ # noqa 

274 dummy = _get_cris_dd_row(column=None, recipient=recipient) 

275 wr = csv.DictWriter(file, fieldnames=list(dummy.keys())) 

276 wr.writeheader() 

277 for col in _gen_columns_for_anon_staging_db(req, recipient): 

278 d = _get_cris_dd_row(column=col, recipient=recipient) 

279 wr.writerow(d) 

280 

281 

282# ----------------------------------------------------------------------------- 

283# CRATE 

284# ----------------------------------------------------------------------------- 

285 

286def _get_crate_dd_row(column: Union[Column, CamcopsColumn, None], 

287 recipient: "ExportRecipientInfo", 

288 dest_dialect: Dialect = None, 

289 src_db: str = "camcops", 

290 default_indexlen: int = 100) -> Dict: 

291 """ 

292 Args: 

293 column: 

294 A column specification (or ``None`` to create a dummy dictionary). 

295 recipient:  

296 a :class:`camcops_server.cc_modules.cc_exportrecipientinfo.ExportRecipientInfo` 

297 dest_dialect: 

298 The SQL dialect of the destination database. If ``None``, then 

299 MySQL is used as the default. 

300 src_db: 

301 Value to be used for the "src_db" field. 

302 default_indexlen: 

303 Default index length for fields that require one. 

304 

305 Returns: 

306 An :class:`OrderedDict` with information for a CRATE data dictionary 

307 row. 

308 """ # noqa 

309 dest_dialect = dest_dialect or MySQLDialect() 

310 exempt_from_anonymisation = False 

311 identifies_patient = False 

312 identifies_respondent = False 

313 force_include = False 

314 if column is None: 

315 # Dummy row 

316 colname = None 

317 tablename = None 

318 comment = None 

319 coltype = None 

320 needs_scrubbing = False 

321 desttype = None 

322 destsqltype = None 

323 else: 

324 colname = column.name 

325 tablename = column.table.name 

326 comment = column.comment 

327 coltype = coltype_as_typeengine(column.type) 

328 is_free_text = is_sqlatype_text_of_length_at_least( 

329 coltype, min_length=MIN_STRING_LENGTH_TO_CONSIDER_SCRUBBING) 

330 

331 if isinstance(column, CamcopsColumn): 

332 exempt_from_anonymisation = column.exempt_from_anonymisation 

333 identifies_patient = column.identifies_patient 

334 force_include = column.include_in_anon_staging_db 

335 

336 needs_scrubbing = is_free_text and not exempt_from_anonymisation 

337 desttype = convert_sqla_type_for_dialect( 

338 coltype=coltype, 

339 dialect=dest_dialect, 

340 strip_collation=True, 

341 expand_for_scrubbing=needs_scrubbing, 

342 ) 

343 destsqltype = desttype.compile(dialect=dest_dialect) 

344 

345 # src_flags 

346 src_flags = [] # type: List[str] 

347 primary_key = colname == FN_PK 

348 if primary_key: 

349 src_flags.extend(["K", "C"]) 

350 primary_pid = ( 

351 recipient.db_patient_id_per_row and # otherwise just in PatientIdNum 

352 recipient.primary_idnum and 

353 colname == extra_id_colname(recipient.primary_idnum) 

354 ) 

355 if primary_pid: 

356 src_flags.append("P") 

357 defines_primary_pids = False # no single unique table for this... 

358 if defines_primary_pids: 

359 src_flags.append("*") 

360 master_pid = False # not supported for now 

361 if master_pid: 

362 src_flags.append("M") 

363 

364 # scrub_src 

365 if identifies_patient and tablename == Patient.__tablename__: 

366 scrub_src = "patient" 

367 elif identifies_respondent: 

368 scrub_src = "thirdparty" 

369 else: 

370 scrub_src = None 

371 

372 # scrub_method 

373 scrub_method = None # default is fine 

374 

375 # Include in output? 

376 include = ( 

377 force_include or 

378 primary_key or 

379 primary_pid or 

380 master_pid or 

381 not (identifies_patient or identifies_respondent) 

382 ) 

383 

384 # alter_method 

385 if needs_scrubbing: 

386 alter_method = "scrub" 

387 elif tablename == Patient.__tablename__ and colname == Patient.dob.name: 

388 alter_method = "truncate_date" 

389 else: 

390 alter_method = None 

391 

392 # Indexing 

393 crate_index = None 

394 crate_indexlen = None 

395 if column is not None and column.index: 

396 crate_index = "U" if column.unique else "I" 

397 if does_sqlatype_require_index_len(desttype): 

398 crate_indexlen = default_indexlen 

399 

400 return OrderedDict([ 

401 ("src_db", src_db), 

402 ("src_table", tablename), 

403 ("src_field", colname), 

404 ("src_datatype", str(coltype)), 

405 ("src_flags", "".join(src_flags) if src_flags else None), 

406 ("scrub_src", scrub_src), 

407 ("scrub_method", scrub_method), 

408 ("decision", "include" if include else "OMIT"), 

409 ("inclusion_values", None), 

410 ("exclusion_values", None), 

411 ("alter_method", alter_method), 

412 ("dest_table", tablename), 

413 ("dest_field", colname), 

414 ("dest_datatype", destsqltype), 

415 ("index", crate_index), 

416 ("indexlen", crate_indexlen), 

417 ("comment", comment), 

418 ]) 

419 

420 

421def write_crate_data_dictionary(req: "CamcopsRequest", 

422 recipient: "ExportRecipientInfo", 

423 file: TextIO = sys.stdout) -> None: 

424 """ 

425 Generates a draft CRATE data dictionary. 

426  

427 CRATE is an anonymisation tool. See: 

428 

429 - Cardinal RN (2017). 

430 Clinical records anonymisation and text extraction (CRATE): an 

431 open-source software system. 

432 *BMC Medical Informatics and Decision Making* 17: 50. 

433 https://www.pubmed.gov/28441940; 

434 https://doi.org/10.1186/s12911-017-0437-1. 

435 

436 - https://crateanon.readthedocs.io/ 

437 

438 Args: 

439 req: a :class:`camcops_server.cc_modules.cc_request.CamcopsRequest` 

440 recipient: a :class:`camcops_server.cc_modules.cc_exportrecipientinfo.ExportRecipientInfo` 

441 file: output file 

442 """ # noqa 

443 dummy = _get_crate_dd_row(column=None, recipient=recipient) 

444 wr = csv.DictWriter(file, fieldnames=list(dummy.keys())) 

445 wr.writeheader() 

446 for col in _gen_columns_for_anon_staging_db(req, recipient): 

447 d = _get_crate_dd_row(column=col, recipient=recipient) 

448 wr.writerow(d)