Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

"""Parse CSV-alike data from different sources, including:

- markdown tables (e.g. from documentation)

Future:
- csv
- excel

"""

11import asyncio 

12import inspect 

13import sys 

14import time 

15import re 

16import pandas as pd 

17from datetime import timedelta as timedelta 

18from datetime import datetime 

19 

20from io import StringIO 

21# import uuid 

22import random 

23import yaml 

24 

25import parse # https://github.com/r1chardj0n3s/parse 

26 

27from gutools.tools import get_calling_function 

28 

# Matches a whole "<expression>" cell; group(2) captures the inner
# expression text that will later be eval()'d against a context dict.
_regx_eval = re.compile(r"(\<(.*)\>)")

# Parses a header/cell spec such as '*{seq:d}*', '{seq:d}', '*seq*' or 'seq'.
#   idx/idx2 : surrounding '*' (or whitespace) marking an index column
#   format   : the spec usable by parse.parse() (braces optional)
#   param    : the bare parameter name (must start with a letter or '_',
#              so positional specs like '{0}' deliberately do NOT match)
#   fmt      : the optional ':' format part (e.g. 'd')
_regx_param = re.compile(r"""
(?P<idx>[\*\s]+)?
(?P<format>
\{?
 (?P<param>[_a-z][^:\*\}]*)
 (:(?P<fmt>[^\*\}]*))?
\}?
)
(?P<idx2>[\*\s]+)?

""", re.VERBOSE | re.IGNORECASE)

42 

# Import-time sanity checks for _regx_param.  This replaces the previous
# stray debug code, which printed the groupdicts on every import of the
# module; the checks themselves are kept as silent assertions.
for _spec in ('*{seq:d}*', '{seq:d}', '*seq*', 'seq', '{bid1}', '{_foo}'):
    assert _regx_param.match(_spec) is not None, _spec
del _spec
# Positional (unnamed) placeholders must NOT match.
assert not _regx_param.match('{0}')

57 

58 

# Default per-column converters applied when extracting markdown tables:
# maps a known column name to the callable used to coerce its cells.
default_converters = {
    'seq': int,
    'p0': float,
    'p1': float,
    'date': datetime.fromisoformat,
    'type': float,
    'amount': float,
    'price': float,
    'lid': int,
    'exp': float,
    'fill': float,
    'uf': int,
    'cf': int,
    'cd': int,
    'bid': int,
}

63 

64# -------------------------------------------------------- 

65# Markdown tables extractor 

66# -------------------------------------------------------- 

67 

def apply_converters(row, conv, nan_error=False):
    """Apply positional converters to *row* in place and return it.

    row: mutable sequence of cell values.
    conv: iterable of (index, callable) pairs, e.g. from prepare_converters().
    nan_error: when True, a failed conversion stores NaN instead of
        leaving the original value untouched.
    """
    for i, c in conv:
        try:
            row[i] = c(row[i])
        except Exception:  # fix: was a bare `except:` (also caught SystemExit)
            if nan_error:
                # fix: pd.np was deprecated and removed in pandas 2.0;
                # float('nan') is the same value.
                row[i] = float('nan')

    # always convert 'NaN' strings to a real NaN
    for i, x in enumerate(row):
        if x in ('NaN', ):
            row[i] = float('nan')
    return row

81 

82 

def parse_fields(row, header, env, nan_error=False):
    """Parse each cell of *row* according to the spec in *header*.

    header: list of column specs (see _regx_param), parallel to row.
    env: context dict; captured named values are stored here and are
        available to '<expr>' cells evaluated in the second pass.
    nan_error: store NaN in cells whose parsing raised.

    Returns the (mutated) row, or None when an index column could not be
    parsed (the row must then be discarded).
    """
    # step 1: parse every cell with a known format and update the context
    for i, fmt in enumerate(header):
        try:
            d = _regx_param.match(fmt).groupdict()
            info = parse.parse(d['format'], row[i])
            if info:
                env.update(info.named)
                # we assume there's only 1 named param or, at least,
                # that the cell value is the last one parsed
                for key, value in info.named.items():
                    row[i] = value
            elif d['idx']:
                return None  # it is an index and we can not parse the value
        except Exception:
            if nan_error:
                # fix: pd.np was removed in pandas 2.0
                row[i] = float('nan')

    # step 2: evaluate '<expr>' cells against the accumulated context
    for i, t_value in enumerate(row):
        if isinstance(t_value, str):
            m = _regx_eval.match(t_value)
            if m:
                exp = m.group(2)
                try:
                    # SECURITY: eval of table-supplied expressions;
                    # only safe with trusted (test fixture) content.
                    row[i] = eval(exp, env)
                except Exception:
                    pass  # keep the raw cell text when evaluation fails
    return row

112 

def prepare_converters(columns, converters):
    """Pair each column position with its converter.

    Returns a list of (column_index, converter) tuples, skipping any
    column that has no (truthy) entry in *converters*.
    """
    return [(pos, converters[name])
            for pos, name in enumerate(columns)
            if converters.get(name)]

120 

def parse_df_header(header):
    """Analyze a markdown table header.

    header may be a raw '| a | *b* |' line or an already-split list of
    column specs.  Returns (idx0, idx1, header0, header1): the raw index
    column spec, its bare name, all bare column names, and the column
    names without the index column.
    """
    if isinstance(header, str):
        header = [cell.strip() for cell in header.strip().strip('|').split('|')]

    idx0 = idx1 = None
    header0 = []
    for col in header:
        m = _regx_param.match(col)
        if m:
            spec = m.groupdict()
            header0.append(spec['param'])
            if spec['idx']:
                # '*name*' marks the index column
                idx0, idx1 = col, spec['param']
        else:
            header0.append(col)

    header1 = list(header0)
    if idx1:
        header1.remove(idx1)

    return idx0, idx1, header0, header1

143 

def set_df_index(df, header):
    """Rename the index column to its bare name and set it as df's index.

    No-op when *header* declares no '*name*' index column.
    """
    raw_col, bare_name, _, _ = parse_df_header(header)
    if bare_name:
        df.rename({raw_col: bare_name}, axis='columns', inplace=True)
        df.set_index(bare_name, inplace=True)

149 

150 

def Markdown_extractor(content, converters=default_converters, env=None, nan_error=False, what=['table']):
    """Extract elements from MD formatted content, yielding one pandas
    DataFrame per table found.

    content: markdown text.
    converters: accepted for interface compatibility (cell conversion is
        currently driven by the header format specs, not by this dict).
    env: shared context dict for captured/evaluated cell values.
    nan_error: forwarded to parse_fields (fix: it was previously accepted
        but never passed down).
    what: reserved for future element kinds; only tables are extracted.
    """
    env = env if env is not None else dict()
    stream = StringIO(content)

    while not stream.closed:
        rows = table_reader(stream)
        data = list()
        for header_fmt in rows:
            idx0, idx1, header0, header1 = parse_df_header(header_fmt)
            break
        else:
            break  # no header found, stop reading

        for row in rows:
            row = parse_fields(row, header_fmt, env, nan_error)
            if row:
                data.append(row)

        df = pd.DataFrame(data, columns=header0)
        if idx1:
            df.set_index(idx1, inplace=True)
        yield df

177 

def table_reader(stream):
    """Parse one table in MD format.

    Yields the header row first (the row just above the '----' separator
    line) and then every data row, stopping at the first blank line.
    The stream is moved forward to the next MD content.
    """
    header = None
    # fix: 'last' was unbound (NameError) when the separator line was the
    # very first line of the stream
    last = None
    for line in stream.readlines():
        row = [scalar.strip() for scalar in line.strip().strip('|').split('|')]

        # a header is detected by the '----' / ':---:' separator line
        # that follows it
        c = ''.join(row)
        if not header:
            if c and not c.strip('-').strip(':'):
                header = last
                yield header
        else:
            if line.strip():
                yield row
            else:
                break  # blank line terminates the table
        last = row

200 

def df_asdict(df):
    """Yield each df row as a dict, adding the index under its own name."""
    index_name = df.index.name
    for idx, series in df.iterrows():
        record = dict(zip(series.index, series.values))
        record[index_name] = idx
        yield record

207 

def list_asdict(lines, header):
    """Yield each row of *lines* as a dict keyed by *header* names."""
    for values in lines:
        yield dict(zip(header, values))

212 

def twin_iter(a, b):
    """Iterate the sorted keys of *a*, yielding (key, a[key], b[key])."""
    for key in sorted(a):
        yield key, a[key], b[key]

218 

class Match(object):
    """Compare an observed DataFrame against an expected markdown table.

    The expected table may contain:
      - '{name}' / '*name*' cells that CAPTURE the observed value into env
      - '<expr>' cells that are eval()'d against env in a second pass
      - literal cells that are cast to the observed cell's type
    match() runs two full passes ('capture', then 'eval') so captures made
    anywhere in the table are visible to every expression.
    """
    def __init__(self, df, content, env):
        # df: observed pandas DataFrame; content: expected markdown table;
        # env: shared capture/eval context (mutated during match()).
        self.df = df
        # self.df.reset_index() # to add index into context values
        self.content = content
        self.env = env

    def match(self):
        """Return True when every row of df matches the expected table."""
        for step in ['capture', 'eval']:
            # re-read the expected markdown from scratch on each pass
            stream = StringIO(self.content)
            lines = table_reader(stream)

            for header in lines:
                idx0, idx1, header0, header1 = parse_df_header(header)
                break
            env = self.env

            # iterate df rows and template rows one by one
            for i, (e_row, t_row) in enumerate(
                zip(df_asdict(self.df), list_asdict(lines, header0))):

                # both rows must share exactly the same column set
                a, b = list(e_row.keys()), list(t_row.keys())
                a.sort()
                b.sort()
                assert a == b

                # pass 1: cast to same type and capture
                env.update(e_row)

                pending = list()  # '<expr>' cells deferred to pass 2
                for key, e_value, t_value in twin_iter(e_row, t_row):
                    # - ignore <> expressions
                    m = _regx_eval.match(t_value)
                    if m:
                        pending.append((key, m.group(2)))
                        continue

                    # - try to capture variable from e_row
                    m = _regx_param.match(t_value)
                    if m:
                        d = m.groupdict()
                        env[d['param']] = t_row[key] = e_row[key]
                        continue

                    # - try to cast value to same class. NaN is convertted as well
                    # e_value = e_row[key]
                    try:
                        t_row[key] = e_value.__class__(t_value)
                        continue
                    except Exception as why:
                        print(why)
                        foo = 1  # cast failed: cell kept as raw text

                if step in ('eval', ):
                    # pass 2: expand <vars>
                    for key, exp in pending:
                        try:
                            # NOTE(review): eval of expected-table expressions;
                            # only safe with trusted test content.
                            t_row[key] = eval(exp, env)
                        except Exception as why:
                            foo = 1  # expression failed: mismatch will show below

                # compare
                if t_row != e_row:
                    diff = list()
                    diff.append(f"{'key':9} {'Expected':>8} --- {'Observed':<8}")
                    for key, e_value, t_value in twin_iter(e_row, t_row):
                        if e_value != t_value:
                            # NOTE(review): pd.np was removed in pandas 2.0;
                            # this line needs np.isnan there — confirm pandas version.
                            if isinstance(e_value, float) and not pd.np.isnan(e_value):
                                diff.append(f"- {key:6}: {e_value:>8} != {t_value:<8}")

                    diff = '\n'.join(diff)
                    error = f"""*** ERROR ***
{self.df}

row: {i}
{diff}
"""
                    print(error)
                    return False
        return True

299 

300 

301 

def iter_df(df, converters=default_converters, nan_error=False):
    """Yield every df row as [index, col0, col1, ...] with converters applied.

    converters: column-name -> callable map used to coerce cell values.
    nan_error: forwarded to apply_converters (NaN on failed conversion).
    """
    # build the (position, converter) pairs once, including the index name
    # as the first field when the df has a named index
    if df.index.name:
        fields = [df.index.name, *df.columns]
    else:
        fields = list(df.columns)
    conv = prepare_converters(fields, converters)

    for idx, values in df.iterrows():
        yield apply_converters([idx, *values], conv, nan_error)

317 

318 

319 

320# -------------------------------------------------------- 

321# Check internal table structures in the middle of algorithm 

322# -------------------------------------------------------- 

323 

async def inject_events(df, hub, key, klass, converters=default_converters, rate=10,
                        pub_keys=['{key}', '/test{key}', ]):
    """Inject events from a df into Hub, converting row values prior to
    building a *klass* record.

    Each record is published under every formatted key in *pub_keys*; the
    first key gets priority 1 and every mirror key priority 15.
    rate: approximate events per second.
    """
    ctx = locals()
    pub_keys = [k.format(**ctx) for k in pub_keys]

    # delay between publications
    # fix: the previous first assignment `s = 1 / (rate * len(pub_keys))`
    # was immediately overwritten (dead code)
    s = 1 / rate

    for row in iter_df(df, converters):
        record = klass(*row)

        for i, k in enumerate(pub_keys):
            priority = 1 if i == 0 else 15
            await asyncio.sleep(s)

            join = asyncio.Event()
            hub.publish(k, record, priority=priority, join=join)
            await join.wait()  # wait until the hub has processed the event

353 

354 

async def inject_events_using_date(df, hub, key, klass, date='date', converters=None, speed=1):
    """Inject events from a df into Hub, pacing publications by the gap
    between consecutive record dates.

    date: attribute name holding the record timestamp.
    speed: time-compression factor (2 -> twice as fast as real time).
    """
    # fix: passing converters=None down to iter_df crashed in
    # prepare_converters (None has no .get)
    converters = converters if converters is not None else default_converters

    now = None
    for row in iter_df(df, converters):
        record = klass(*row)
        when = getattr(record, date)

        if now:
            dt = when - now
            # fix: timedelta.seconds ignores the days component;
            # total_seconds() is the real elapsed time
            await asyncio.sleep(dt.total_seconds() / speed)

        now = when
        hub.publish(key, record)

372 

373 

async def await_until(condition, timeout=10, sampling=10, extra=0):
    """Poll *condition* until it becomes truthy or *timeout* expires.

    condition: a Python expression string evaluated against a snapshot of
        the caller's local variables.
    sampling: polls per second.
    extra: additional sleep after the condition holds.
    Raises TimeoutError when the condition never becomes truthy.
    """
    # snapshot the caller's locals as the eval context
    frame = sys._getframe(1)
    context = dict(frame.f_locals)

    t0 = time.time()
    s = 1 / sampling
    while time.time() - t0 < timeout:
        try:
            # NOTE: eval of a caller-supplied expression; intended for
            # trusted test code only.
            if eval(condition, context):
                break
        except Exception:  # fix: was a bare `except:`
            pass
        await asyncio.sleep(s)
    else:
        # fix: message previously read "wait_until() failled"
        raise TimeoutError("await_until() failed")

    await asyncio.sleep(extra)

393 

394 

395 

def parse_df_states(states, converters=default_converters):
    """Parse a dict of internal states in Markdown, used to check internal
    status during the algorithm evolution.

    Returns {seq: DataFrame} keeping the last table found in each state.
    """
    df_states = dict()
    for seq, state in states.items():
        for frame in Markdown_extractor(state, converters):
            df_states[seq] = frame
    return df_states

405 

406 

407 

class InternalStatusMonitor(object):
    """Watch events on a hub test key and compare an object's DataFrame
    against the expected markdown status for each sequenced event.

    The first mismatch is stored in self.result (a RuntimeError) and
    raised; later events are ignored once an error happened.
    """
    def __init__(self, hub, key, expected_status, obj, df_attr='df', seq_attr='seq', env=None):
        self.hub = hub
        # mirror key used by inject_events' '/test{key}' publications
        self.key = f'/test{key}'
        self.expected_status = expected_status  # {seq value: expected markdown table}
        self.obj = obj  # where is the df to be checked
        self.df_attr = df_attr  # attribute name of the df on obj
        self.seq_attr = seq_attr  # event attribute used as sequence id

        self.env = env or dict()  # shared capture/eval context for Match
        self.result = None  # first mismatch, as a RuntimeError

        self.hub.subscribe(self.key, self.check_df_status)

    async def check_df_status(self, key, data):
        """Hub callback: check the monitored df when *data*'s sequence
        value has an expected status table."""
        if self.result:
            return  # don't process any check when an error is happend

        value = getattr(data, self.seq_attr)
        expected = self.expected_status.get(value)
        if expected is not None:
            observed = getattr(self.obj, self.df_attr)

            m = Match(observed, expected, self.env)
            r = m.match()
            if not r:
                self.result = RuntimeError(f"*** ERROR: internal status differ in {self.seq_attr}: {value}, data: {data}")
                print(observed)
                print(expected)
                raise self.result
            else:
                print(f"OK: internal status {value} match")
                foo = 1  # ok
        foo = 1

442 

443 

async def execute_supervision_test(states, events, record_klass, key, instance, converters=default_converters, env=None):
    """Run a supervision test end to end.

    Starts instance.app, attaches an InternalStatusMonitor that checks
    *states* against the instance's df, injects every markdown *events*
    table into the hub as *record_klass* records, then stops the app.
    Raises the monitor's stored error when any internal check failed.

    NOTE(review): *converters* is currently unused (the commented-out
    parse_df_states call was its consumer) — confirm before removing.
    """
    app = instance.app
    hub = app.hub
    await app.start()

    # df_states = parse_df_states(states, converters)

    t0 = time.time()
    supervisor = InternalStatusMonitor(hub, key, states, instance, env=env)

    for df in Markdown_extractor(events, env=env):
        # remove rows with no sequence (NaN)
        # df = df[df.index > '']
        # await inject_events(df, hub, key, klass, converters, rate=4)
        # await inject_events_using_date(df, hub, key, klass, 'date', converters, speed=10)
        await inject_events(df, hub, key, record_klass, rate=1000)

    # await await_until('hub._queue.empty()', extra=0.25)
    elapsed = time.time() - t0
    print(f"Elapsed: {elapsed}")
    if supervisor.result:
        raise supervisor.result

    await app.stop()

    foo = 1

471 

class InjectorTest(object):
    """Inject markdown-described events into an app's hub and verify the
    expected internal status after each stage.

    events: markdown content with (seq, stage, key, message, date) tables.
    expected: {stage: {eval_expression: expected_markdown_status}}.
    env: shared capture/eval context used by Match and message expansion.
    """
    def __init__(self, app, events, expected, env=None):
        self.app = app
        self.events = events
        self.expected = expected
        self.env = env

        # runtime
        self.result = None     # first check error, raised at the end of run()
        self.test_key = None   # unique key prefix for this run
        self.checked = None    # set when a status check completes

    async def run(self, timers=True):
        """Start the app, inject every events table, then stop the app.

        timers = True : timers are silenced but defined in events
        """
        app = self.app
        hub = app.hub

        self.test_key = f'/test/run/{random.randint(0, 10**6)}'
        hub.subscribe(f'{self.test_key}/.*', self._check_status)

        await app.start()

        if not timers:  # avoid install timers but mine
            hub._task_timers.cancel()

        pub_keys = ['{key}', f'{self.test_key}/{{stage}}{{key}}']

        t0 = time.time()
        for events in Markdown_extractor(self.events, env=self.env):
            await self.inject_events(events, pub_keys, rate=1000)

        elapsed = time.time() - t0
        print(f"Elapsed: {elapsed}")
        if self.result:
            raise self.result

        await app.stop()

    async def inject_events(self, events, pub_keys, rate=10):
        """Publish every row of *events* under each formatted key in
        *pub_keys*, waiting for both hub processing and the status check."""
        hub = self.app.hub
        s = 1 / rate
        env = self.env

        # fix: asyncio.Event(loop=...) was removed in Python 3.10; the
        # events bind to the running loop automatically.
        self.checked = asyncio.Event()
        join = asyncio.Event()

        for seq, stage, key, message, date in iter_df(events):
            await asyncio.sleep(s)
            try:
                m = _regx_eval.match(message)
                if m:
                    # SECURITY: evaluates '<expr>' cells; only safe with
                    # trusted test content.
                    message = eval(m.group(2), env)
                else:
                    # SECURITY: yaml.load on table content; PyYAML>=6
                    # requires an explicit Loader= argument here.
                    message = yaml.load(message)
            except Exception:
                pass  # keep the raw message when it can not be parsed

            for k in pub_keys:
                k = k.format(**locals())
                self.checked.clear()
                join.clear()
                hub.publish(k, message, join=join)
                await join.wait()  # wait event has been processed

            await self.checked.wait()  # wait check has been done

    async def _check_status(self, key, data):
        """Hub callback: evaluate every expected expression for this stage
        and compare it against its expected markdown status."""
        try:
            if self.result:
                return  # don't process any check once an error happened

            env = self.env
            stage = key.split(self.test_key)[1].split('/')[1]

            expected = self.expected.get(stage) or {}
            for exp, status in expected.items():
                try:
                    observed = eval(exp, env)
                except Exception:
                    # fix: previously fell through with 'observed' unbound
                    continue

                m = Match(observed, status, env)
                r = m.match()

                if not r:
                    # fix: the message referenced undefined self.seq_attr
                    # and 'value' (copied from InternalStatusMonitor)
                    self.result = RuntimeError(f"*** ERROR: internal status differ in stage: {stage}, data: {data}")
                    print(observed)
                    print(status)
                    raise self.result
                else:
                    print(f"Internal status '{stage}' Ok")
        finally:
            self.checked.set()

    # ------------------------------------------------------
    # helpers
    # ------------------------------------------------------
    def _no_timers_subscribe(self, pattern, callback, duplicate=False, single=False):
        # NOTE(review): parse_uri is neither defined nor imported in this
        # module — confirm where it should come from.
        uri_ = parse_uri(pattern)
        if uri_['scheme'] in ('timer', ):
            return
        self.org_subscribe(pattern, callback, duplicate, single)

583 

584 

585 

586 

587# ----------------------------------------------------- 

588# timeit 

589# ----------------------------------------------------- 

590import timeit 

591import os 

592from datetime import datetime 

def speed_meter(N=None, label=None, **test):
    """Run timeit.repeat(**test), append a CSV log line and return stats.

    N: number of "operations" per run, used to derive an ops/sec speed
       (speed is None when N is not given).
    label: CSV label; defaults to the stmt being timed.
    Returns *test* augmented with label/speed/elapsed/now/now_txt keys.

    fix: the original def had an illegal trailing comma after **test
    (SyntaxError), crashed formatting speed when N was None, and divided
    by zero when repeat < 3.
    """
    label = label or '{stmt}'.format(**test)
    elapsed = timeit.repeat(**test)
    elapsed.sort()
    # average the best third of the runs (at least one) to reduce noise
    n = max(test.get('repeat', 5) // 3, 1)
    elapsed = sum(elapsed[:n]) / n
    speed = N / elapsed if N else None

    test['label'] = label
    test['speed'] = speed
    test['elapsed'] = elapsed
    test['now'] = now = datetime.now()
    test['now_txt'] = now_txt = now.strftime('%Y-%m-%dT%H:%M:%S')

    # heuristic: are we running under the Wing IDE debugger?
    _debug_ = 'wingdb' in sys.executable.split(os.path.sep)

    username = os.getenv('USERNAME')
    speed_txt = f'{speed:e}' if speed is not None else ''
    with open(f'/tmp/{username}-speed_meter.csv', mode='a') as f:
        line = f'{now_txt}, {N}, {elapsed:1.5f}, {speed_txt}, {label}, {_debug_}\n'
        f.write(line)
    return test

619 

620