import os
import glob
import logging

import pandas as pd
import obspy
from obspy.core.event import read_events

import seisbench.data as sbd
import seisbench.util as sbu

logging.basicConfig(filename="output.out",
                    filemode='a',
                    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
                    datefmt='%H:%M:%S',
                    level=logging.DEBUG)
logger = logging.getLogger('converter')


def create_traces_catalog(directory, years):
    """Builds a CSV catalog of trace ids and time ranges for all waveform files found per year."""
    for year in years:
        year_directory = f"{directory}/{year}"  # keep `directory` unchanged between years
        # collect waveform files below the year directory (assumes an SDS-style nested layout)
        files = glob.glob(f"{year_directory}/**/*", recursive=True)
        traces = []
        for f in files:
            if not os.path.isfile(f):
                continue
            st = obspy.read(f)
            for tr in st.traces:
                trs = pd.Series({
                    'trace_id': tr.id,
                    'trace_st': tr.meta.starttime,
                    'trace_end': tr.meta.endtime,
                    'stream_fname': f
                })
                traces.append(trs)
        # one row per trace
        traces_catalog = pd.DataFrame(traces)
        # pandas to_csv has no `append` argument; open the file in append mode instead
        traces_catalog.to_csv("data/bogdanka/traces_catalog.csv", mode='a', index=False)


def split_events(events, input_path):
    """Assigns each event to the train, dev or test split based on the cumulative number of picks
    with an existing waveform file (70% / 15% / 15%)."""
    logger.info("Splitting available events into train, dev and test sets ...")
    events_stats = pd.DataFrame()
    events_stats.index.name = "event"
    for i, event in enumerate(events):
        # count only picks whose mseed file exists
        actual_picks = 0
        for pick in event.picks:
            trace_params = get_trace_params(pick)
            trace_path = get_trace_path(input_path, trace_params)
            if os.path.isfile(trace_path):
                actual_picks += 1
        events_stats.loc[i, "pick_count"] = actual_picks

    events_stats['pick_count_cumsum'] = events_stats.pick_count.cumsum()
    train_th = 0.7 * events_stats.pick_count_cumsum.values[-1]
    dev_th = 0.85 * events_stats.pick_count_cumsum.values[-1]
    events_stats['split'] = 'test'
    for i, row in events_stats.iterrows():
        if row['pick_count_cumsum'] < train_th:
            events_stats.loc[i, 'split'] = 'train'
        elif row['pick_count_cumsum'] < dev_th:
            events_stats.loc[i, 'split'] = 'dev'
        else:
            break
    return events_stats


def get_event_params(event):
    """Extracts source metadata (origin, location, magnitude) from an ObsPy Event."""
    origin = event.preferred_origin()
    if origin is None:
        return {}
    mag = event.preferred_magnitude()
    source_id = str(event.resource_id)
    event_params = {
        "source_id": source_id,
        "source_origin_uncertainty_sec": origin.time_errors["uncertainty"],
        "source_latitude_deg": origin.latitude,
        "source_latitude_uncertainty_km": origin.latitude_errors["uncertainty"],
        "source_longitude_deg": origin.longitude,
        "source_longitude_uncertainty_km": origin.longitude_errors["uncertainty"],
        "source_depth_km": origin.depth / 1e3,
        "source_depth_uncertainty_km": origin.depth_errors["uncertainty"] / 1e3
        if origin.depth_errors["uncertainty"] is not None else None,
    }
    if mag is not None:
        event_params["source_magnitude"] = mag.mag
        event_params["source_magnitude_uncertainty"] = mag.mag_errors["uncertainty"]
        event_params["source_magnitude_type"] = mag.magnitude_type
        event_params["source_magnitude_author"] = mag.creation_info.agency_id \
            if mag.creation_info is not None else None
    return event_params


def get_trace_params(pick):
    """Extracts station and channel metadata from an ObsPy Pick."""
    trace_params = {
        "station_network_code": pick.waveform_id.network_code,
        "station_code": pick.waveform_id.station_code,
        "trace_channel": pick.waveform_id.channel_code,
        "station_location_code": pick.waveform_id.location_code,
        "time": pick.time
    }
    return trace_params


def find_trace(pick_time, traces):
    """Returns the trace whose time window contains the pick time, or None."""
    for tr in traces:
        if pick_time > tr.stats.endtime:
            continue
        if pick_time >= tr.stats.starttime:
            return tr
    logger.warning(f"no matching trace for pick: {pick_time}")
    return None


def get_trace_path(input_path, trace_params):
    """Builds the path to the daily waveform file for the station and day of the pick."""
    year = trace_params["time"].year
    day_of_year = pd.Timestamp(str(trace_params["time"])).day_of_year
    net = trace_params["station_network_code"]
    station = trace_params["station_code"]
    tr_channel = trace_params["trace_channel"]
    path = f"{input_path}/{year}/{net}/{station}/{tr_channel}.D/{net}.{station}..{tr_channel}.D.{year}.{day_of_year}"
    return path


def load_trace(input_path, trace_params):
    """Loads the single trace matching the pick, or None if the file or data is missing."""
    trace_path = get_trace_path(input_path, trace_params)
    trace = None

    if not os.path.isfile(trace_path):
        logger.warning(trace_path + " not found")
    else:
        stream = obspy.read(trace_path)
        if len(stream.traces) > 1:
            trace = find_trace(trace_params["time"], stream.traces)
        elif len(stream.traces) == 0:
            logger.warning(f"no data in: {trace_path}")
        else:
            trace = stream.traces[0]
    return trace


def load_stream(input_path, trace_params, time_before=60, time_after=60):
    """Loads the stream around the pick time and returns (sampling_rate, stream)."""
    trace_path = get_trace_path(input_path, trace_params)
    sampling_rate, stream = None, None
    pick_time = trace_params["time"]

    if not os.path.isfile(trace_path):
        logger.warning(trace_path + " not found")
    else:
        stream = obspy.read(trace_path)
        stream = stream.slice(pick_time - time_before, pick_time + time_after)
        if len(stream.traces) == 0:
            logger.warning(f"no data in: {trace_path}")
        else:
            sampling_rate = stream.traces[0].stats.sampling_rate
    return sampling_rate, stream


def convert_mseed_to_seisbench_format():
    """Converts the mseed archive and QuakeML catalog into the SeisBench dataset format."""
    input_path = "/net/pr2/projects/plgrid/plggeposai"
    logger.info("Loading events catalog ...")
    events = read_events(input_path + "/BOIS_all.xml")
    events_stats = split_events(events, input_path)

    output_path = input_path + "/seisbench_format"
    os.makedirs(output_path, exist_ok=True)
    metadata_path = output_path + "/metadata.csv"
    waveforms_path = output_path + "/waveforms.hdf5"

    with sbd.WaveformDataWriter(metadata_path, waveforms_path) as writer:
        writer.data_format = {
            "dimension_order": "CW",
            "component_order": "ZNE",
        }
        for i, event in enumerate(events):
            logger.debug(f"Converting event {i}")
            event_params = get_event_params(event)
            event_params["split"] = events_stats.loc[i, "split"]

            for pick in event.picks:
                trace_params = get_trace_params(pick)
                sampling_rate, stream = load_stream(input_path, trace_params)
                if stream is None:
                    continue

                actual_t_start, data, _ = sbu.stream_to_array(
                    stream,
                    component_order=writer.data_format["component_order"],
                )
                trace_params["trace_sampling_rate_hz"] = sampling_rate
                trace_params["trace_start_time"] = str(actual_t_start)

                # sample index of the pick relative to the start of the extracted window
                pick_time = obspy.core.utcdatetime.UTCDateTime(trace_params["time"])
                pick_idx = (pick_time - actual_t_start) * sampling_rate
                trace_params[f"trace_{pick.phase_hint}_arrival_sample"] = int(pick_idx)

                writer.add_trace({**event_params, **trace_params}, data)


if __name__ == "__main__":
    convert_mseed_to_seisbench_format()
    # create_traces_catalog("/net/pr2/projects/plgrid/plggeposai/", ["2018", "2019"])