diff --git a/scripts/mseeds_to_seisbench.py b/scripts/mseeds_to_seisbench.py
index 035b360..9248fdd 100644
--- a/scripts/mseeds_to_seisbench.py
+++ b/scripts/mseeds_to_seisbench.py
@@ -1,54 +1,30 @@
+"""
+-----------------
+Copyright © 2023 ACK Cyfronet AGH, Poland.
+This work was partially funded by EPOS Project funded in frame of PL-POIR4.2
+-----------------
+"""
+
 import os
 import pandas as pd
-import glob
-from pathlib import Path
 import obspy
 from obspy.core.event import read_events
-import seisbench
 import seisbench.data as sbd
 import seisbench.util as sbu
-import sys
+
 import logging
 import argparse
-
 logging.basicConfig(filename="output.out", filemode='a',
                     format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
                     datefmt='%H:%M:%S', level=logging.DEBUG)
-
-
 logger = logging.getLogger('converter')
 
 
-def create_traces_catalog(directory, years):
-    for year in years:
-        directory = f"{directory}/{year}"
-        files = glob.glob(directory)
-        traces = []
-        for i, f in enumerate(files):
-            st = obspy.read(f)
-
-            for tr in st.traces:
-                # trace_id = tr.id
-                # start = tr.meta.starttime
-                # end = tr.meta.endtime
-
-                trs = pd.Series({
-                    'trace_id': tr.id,
-                    'trace_st': tr.meta.starttime,
-                    'trace_end': tr.meta.endtime,
-                    'stream_fname': f
-                })
-                traces.append(trs)
-
-        traces_catalog = pd.DataFrame(pd.concat(traces)).transpose()
-        traces_catalog.to_csv("data/bogdanka/traces_catalog.csv", append=True, index=False)
-
-
 def split_events(events, input_path):
 
     logger.info("Splitting available events into train, dev and test sets ...")
@@ -61,6 +37,7 @@ def split_events(events, input_path):
         for pick in event.picks:
             trace_params = get_trace_params(pick)
             trace_path = get_trace_path(input_path, trace_params)
+
             if os.path.isfile(trace_path):
                 actual_picks += 1
 
@@ -87,7 +64,6 @@ def get_event_params(event):
     origin = event.preferred_origin()
     if origin is None:
         return {}
-    # print(origin)
 
     mag = event.preferred_magnitude()
 
@@ -115,12 +91,10 @@ def get_trace_params(pick):
-    net = pick.waveform_id.network_code
-    sta = pick.waveform_id.station_code
 
     trace_params = {
-        "station_network_code": net,
-        "station_code": sta,
+        "station_network_code": pick.waveform_id.network_code,
+        "station_code": pick.waveform_id.station_code,
         "trace_channel": pick.waveform_id.channel_code,
         "station_location_code": pick.waveform_id.location_code,
         "time": pick.time
@@ -134,7 +108,6 @@ def find_trace(pick_time, traces):
         if pick_time > tr.stats.endtime:
             continue
         if pick_time >= tr.stats.starttime:
-            # print(pick_time, " - selected trace: ", tr)
             return tr
 
     logger.warning(f"no matching trace for peak: {pick_time}")
@@ -151,13 +124,25 @@ def get_trace_path(input_path, trace_params):
     path = f"{input_path}/{year}/{net}/{station}/{tr_channel}.D/{net}.{station}..{tr_channel}.D.{year}.{day_of_year}"
     return path
 
 
+def get_three_channels_trace_paths(input_path, trace_params):
+    year = trace_params["time"].year
+    day_of_year = pd.Timestamp(str(trace_params["time"])).day_of_year
+    net = trace_params["station_network_code"]
+    station = trace_params["station_code"]
+
+    paths = []
+    for channel in ["EHE", "EHN", "EHZ"]:
+        paths.append(f"{input_path}/{year}/{net}/{station}/{channel}.D/{net}.{station}..{channel}.D.{year}.{day_of_year}")
+
+    return paths
+
 
 def load_trace(input_path, trace_params):
     trace_path = get_trace_path(input_path, trace_params)
     trace = None
 
     if not os.path.isfile(trace_path):
-        logger.w(trace_path + " not found")
+        logger.warning(trace_path + " not found")
     else:
         stream = obspy.read(trace_path)
         if len(stream.traces) > 1:
@@ -171,14 +156,20 @@ def load_stream(input_path, trace_params, time_before=60, time_after=60):
-    trace_path = get_trace_path(input_path, trace_params)
     sampling_rate, stream = None, None
     pick_time = trace_params["time"]
 
-    if not os.path.isfile(trace_path):
-        print(trace_path + " not found")
-    else:
-        stream = obspy.read(trace_path)
+    trace_paths = get_three_channels_trace_paths(input_path, trace_params)
+    for trace_path in trace_paths:
+        if not os.path.isfile(trace_path):
+            logger.warning(trace_path + " not found")
+        else:
+            if stream is None:
+                stream = obspy.read(trace_path)
+            else:
+                stream += obspy.read(trace_path)
+
+    if stream is not None:
         stream = stream.slice(pick_time - time_before, pick_time + time_after)
         if len(stream.traces) == 0:
             print(f"no data in: {trace_path}")
@@ -210,13 +201,23 @@ def convert_mseed_to_seisbench_format(input_path, catalog_path, output_path):
             "dimension_order": "CW",
             "component_order": "ZNE",
         }
+
         for i, event in enumerate(events):
             logger.debug(f"Converting {i} event")
             event_params = get_event_params(event)
             event_params["split"] = events_stats.loc[i, "split"]
 
+            picks_per_station = {}
             for pick in event.picks:
-                trace_params = get_trace_params(pick)
+                station = pick.waveform_id.station_code
+                if station in picks_per_station:
+                    picks_per_station[station].append(pick)
+                else:
+                    picks_per_station[station] = [pick]
+
+            for picks in picks_per_station.values():
+
+                trace_params = get_trace_params(picks[0])
                 sampling_rate, stream = load_stream(input_path, trace_params)
                 if stream is None:
                     continue
@@ -229,15 +230,14 @@ def convert_mseed_to_seisbench_format(input_path, catalog_path, output_path):
                 trace_params["trace_sampling_rate_hz"] = sampling_rate
                 trace_params["trace_start_time"] = str(actual_t_start)
 
-                pick_time = obspy.core.utcdatetime.UTCDateTime(trace_params["time"])
-                pick_idx = (pick_time - actual_t_start) * sampling_rate
-
-                trace_params[f"trace_{pick.phase_hint}_arrival_sample"] = int(pick_idx)
+                for pick in picks:
+                    pick_time = obspy.core.utcdatetime.UTCDateTime(pick.time)
+                    pick_idx = (pick_time - actual_t_start) * sampling_rate
+                    trace_params[f"trace_{pick.phase_hint}_arrival_sample"] = int(pick_idx)
 
                 writer.add_trace({**event_params, **trace_params}, data)
 
 
-
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Convert mseed files to seisbench format')