#!/usr/bin/env python
import time
import h5py
from gwpy.timeseries import TimeSeries, TimeSeriesDict
from gwpy.timeseries import TimeSeries
import pandas as pd
from math import floor
import concurrent.futures
import itertools as itrtls
import os
import sys
import numpy as np
from scipy import stats
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
import origli.utilities.const as Const # for multi frequency bands defined in
from origli.utilities.utilities import select_some_trials
from origli.utilities.utilities import IdentifyGlitch, FindRadomlistPoints
#------------------------------
# for selecting the null samples
#--------------------------
[docs]def FindBGlis_extendBG(state, number_trials, step, outputMother_dir, df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UpperDurationThresh, LowerDurationThresh, UserDefinedDuration, gap, TriggerPeakFreqLowerCutoff=0, TriggerPeakFreqUpperCutoff=8192, targetUpperSNR_thre=np.inf):
'''
description: get the null samples where their durations are drawn from the target set. The only subset of the null samples are chosen if their on-source windows do not coincide with any other glitches.
1. gltich file (.csv file)
2. the target samples
3. create random time stamps with their durations drawn from the target set
4. accept the random time stamps where their on-source windows do not coincide with any other glitches
5. return the info of the accepted glitches
USAGE: Listsegments = FindBGlis_extendBG(state, number_trials, step, outputMother_dir, df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UpperDurationThresh, LowerDurationThresh, UserDefinedDuration, gap, TriggerPeakFreqLowerCutoff, TriggerPeakFreqUpperCutoff, targetUpperSNR_thre)
:param df: GravitySpy meta data in pandas format
:param Epochstart: starting time of an epoch
:param Epochend: end time of an epoch
:param Commissioning_lt: commissioning time in list
:param TargetGlitchClass: a target glitch class name (str)
:param IFO: a type of interferometer (H1, L1, V1) (str)
:param BGSNR_thre: an upper threhold of SNR for background glitches, i.e., quiet enough ), float or int
:param targetSNR_thre: a lower threshold of SNR for target glitches, float or int
:param Confidence_thre: a threshold of confidence level (float or int )
:param UpperDurationThresh: an upper bound of duration in sec (float or int)
:param LowerDurationThresh: a lower bound of duration in sec (float or int)
:param UserDefinedDuration: user defined duration of a glitch (float or int), 0 in default
:param gap: a time gap between the target and the background segments in sec, 1 sec in default
:param TriggerPeakFreqUpperCutoff: the lower SNR threshold for selecting the target set
:param targetUpperSNR_thre: the upper SNR threshold for selecting the target set
:return: the list of parameters of glitches passing the above thresholds
Listsegments contains of
ListIndexSatisfied: a list of index of glitches
Listtarget_timeseries_start: a list of a target glitch starting time
Listtarget_timeseries_end: a list of a target glitch ending time
Listpre_background_start: a list of preceding background starting time
Listpre_background_end; a list of preceding background ending time
Listfol_background_start: a list of following background starting time
Listfol_background_end: a list of following background ending time
Listgpstime: a list of GPS times
Listduration: a list of durations
ListSNR: a list of SNRs
Listconfi: a list of confidence levels
ListID: a list of IDs
'''
df_list = []
# take the data in an Epoch
for Epoch in Epoch_lt:
StartT, EndT = Epoch[0], Epoch[1]
df_epoch = df[(df['GPStime'] >= StartT) & (df['GPStime'] <= EndT)]
df_list.append(df_epoch)
df_int = pd.concat(df_list)
# data set containing only a target glitch class
try: # gravity spy and omicron trigger mode
df_target = df_int[(df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['peakFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['peakFreq'] <= TriggerPeakFreqUpperCutoff) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)]
except KeyError: # pyCBC mode as there is no column "peakFreq"
df_target = df_int[(df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['centralFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['centralFreq'] <= TriggerPeakFreqUpperCutoff) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)]
#for cWB
#df_target = df_int[(df_int['rho'] > 9) & (df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['centralFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['centralFreq'] <= TriggerPeakFreqUpperCutoff) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)]
# generate synthetic data points
df_random = FindRadomlistPoints(state, IFO, Epoch_lt, number_trials, step, outputMother_dir, df_target[df_target.snr > BGSNR_thre])
#df_random = FindShiftedPoints(state, IFO, Epoch_lt, number_trials, step, outputMother_dir, df_target[df_target.snr > BGSNR_thre])
df_int = df_int.append(df_random, ignore_index=True) # append the dataset of random points
#### improvement for duration 2019-07-02 #############
#### full durations are too long for capturing the glitch signature so that the solution is to choose T90
### In this situation, a gap that is manually chosen will be gap_new = gap_m + 0.05*T100
### This function is for getting quite segments so that I concervatively choose the longest fraction of duration, in default
FractionEnergy = 1 # (100% of energy )
# note (1 - FractionEnergy)/2. is the fraction of the one-sided off T90
df_int = df_int.assign(gap=gap + (1 - FractionEnergy) / 2. * df_int['duration'])
df_int['duration'] = np.round(FractionEnergy * df_int['duration'], 3) # replace full durations (T100) with T90
# make the time segments of glitches measured
# I reserve duration +/- gap around a GPS time of a glitch
if UserDefinedDuration > 0: # accept a user defined duration
# assign this user defined duration for any types of glieches observed in a given IFO
df_int.loc[(df_int['ifo'] == IFO), ['duration']] = UserDefinedDuration
# df_int.loc[(df_int['ifo'] == IFO) & (df_int['label'] == 'null'), ['duration']] = UserDefinedDuration
else: # otherwise, keep the duration that are determined by an event trigger generator (ETG)
pass
df_int = df_int.assign(reserved_time_start=df_int['GPStime'] - df_int['duration'] / 2. - df_int['gap'])
df_int = df_int.assign(reserved_time_end=df_int['GPStime'] + df_int['duration'] / 2. + df_int['gap'])
# data set of randomly chosen background
df_glitch = df_int[df_int.label == 'null'] # synthetic background trigger has SNR of zero
# exclude glitches with SNR <= BGSNR from df_int as they are quiet enough to be considered as background
df_int = df_int[df_int['snr'] > BGSNR_thre]
print('all the glitches in the class: {}'.format(df_glitch.shape[0]))
# the list of values satisfied as secure
ListIndexSatisfied = []
Listtarget_timeseries_start = []
Listtarget_timeseries_end = []
Listpre_background_start = []
Listpre_background_end = []
Listfol_background_start = []
Listfol_background_end = []
Listgpstime = []
Listduration = []
ListSNR = []
Listconfi = []
ListID = []
for ind in range(len(df_glitch)):
# target glitch's peak GPS time and the duration
gpstime = df_glitch['GPStime'].iloc[ind] # a peak GPS time
duration = df_glitch['duration'].iloc[ind] # # a duration
SNR = df_glitch['snr'].iloc[ind] # snr
confi = df_glitch['confidence'].iloc[ind] # confidence level
ID = df_glitch['id'].iloc[ind] # ID
gap_individual = df_glitch['gap'].iloc[ind]
if duration > 0.02: # accept the duration of glitch is greater than 0.02 sec
pass # go to the next line
else: # if the duration of a glitch is less than 0.01 sec
continue # go to the next iteration
# the starting and ending time of the target glitch
# at this moment I assume that the target glitch is distributed evenly around the GPS time
target_timeseries_start = gpstime - duration / 2.
target_timeseries_end = target_timeseries_start + duration
# take the background
# the background is around the target glitch
# preceding background, which is a nearest neighbor segment before target_timeseries_start
# following background, which is a nearest neighbor segment after target_timeseries_end
background_duration = 64 # duration # duration of both the proceeding and following background
pre_background_start = target_timeseries_start - gap_individual - background_duration
pre_background_end = pre_background_start + background_duration # target_timeseries_start - gap_individual
fol_background_start = target_timeseries_end + gap_individual
fol_background_end = fol_background_start + background_duration # target_timeseries_end + gap_individual + background_duration
# security check of the preceding background
# case 1 is where the background is coincident with the right edge of the reserved segment
# case 2 is where the background is coincident with the left edge of the reserved segment
# case 3 is where the reserved segment is inside of the background
# case 4 is where the background is inside of the reserved segment
# it violates the security of the background estimation if at least one of the cases occurs
# i.e., pro_case1, 2, 3, and 4 should be zero
case1 = len(df_int[(df_int['reserved_time_end'] > target_timeseries_start) & (df_int['reserved_time_end'] < target_timeseries_end)]) # case 1
case2 = len(df_int[(df_int['reserved_time_start'] > target_timeseries_start) & (df_int['reserved_time_start'] < target_timeseries_end)]) # case 2
case3 = len(df_int[(df_int['reserved_time_start'] > target_timeseries_start) & (df_int['reserved_time_end'] < target_timeseries_end)]) # case 3
case4 = len(df_int[(df_int['reserved_time_start'] < target_timeseries_start) & (df_int['reserved_time_end'] > target_timeseries_end)]) # case 4
if case1 == 0 and case2 == 0 and case3 == 0 and case4 == 0:
print('{:.1f}, {:.1f}, {:.1f}'.format(pre_background_end - pre_background_start,
fol_background_end - fol_background_start, duration))
ListIndexSatisfied.append(ind) # append the index satisfied as secure
Listtarget_timeseries_start.append(target_timeseries_start)
Listtarget_timeseries_end.append(target_timeseries_end)
Listpre_background_start.append(pre_background_start)
Listpre_background_end.append(pre_background_end)
Listfol_background_start.append(fol_background_start)
Listfol_background_end.append(fol_background_end)
Listgpstime.append(gpstime)
Listduration.append(duration)
ListSNR.append(SNR)
Listconfi.append(confi)
ListID.append(ID)
else:
continue
# zipped list of segments
Listsegments = list(
zip(ListIndexSatisfied, Listtarget_timeseries_start, Listtarget_timeseries_end, Listpre_background_start,
Listpre_background_end, Listfol_background_start, Listfol_background_end, Listgpstime, Listduration,
ListSNR, Listconfi, ListID))
return Listsegments
#-----------------------------------------------------------------------------------------------------------
# For "burn-in" phase
# The following functions are for making the background upper threshold for the target and null samples
#-----------------------------------------------------------------------------------------------------------
#===== select ====
[docs]def FindRadomlistPointsForBurnIn(state, IFO, Epoch_lt, number_samples, step, duration_max, outputMother_dir):
'''
description: create random time stamps with their durations are uniformly distributed in log10 of 0.02 sec to duration_max
1. within an epoch, create a list of synthetic points with randomly chosen with durations
2. make pandas frame dataset
USAGE: df = FindRadomlistPointsForBurnIn(state, IFO, Epoch_lt, number_samples, step, duration_max, outputMother_dir)
:param state: IFO state {observing, nominal-lock}
:param IFO: an observer {H1, L1}
:param Epoch_lt: a list of epochs
:param number_samples: number of samples picked up
:param step: step of data points in sec
:param duration_max: a maximum value of the duration in sec
:param outputMother_dir: an output directory in witch the data set is in
:return: df: synthetic random data points within an epoch
'''
# -----------------------------------------------------------------------------------------------------+
# get a list of start and end GPS times of an epoch with a given state {'observing', 'nominal-lock'} |
# use IdentifyGlitch.FindObservingTimeSegments() |
# -----------------------------------------------------------------------------------------------------+
# get a glitch identifier object
indentify = IdentifyGlitch()
# get start time and end time of an epoch
startT, endT = Epoch_lt[0][0], Epoch_lt[-1][-1]
# get observing segments
SegmentsMat, trigger_dir = indentify.FindObservingTimeSegments(IFO, startT, endT, outputMother_dir, state)
if state == 'nominal-lcok':
SegmentsMat[:,0] = SegmentsMat[:, 0] + 300 # remove 5 min after the observing mode begins
SegmentsMat[:,1] = SegmentsMat[:, 1] - 60 # remove 1 min before the observing mode ends
elif state == 'observing':
pass
total_seg_used = (SegmentsMat[:,1] - SegmentsMat[:,0]).sum()
if number_samples > int(total_seg_used/float(step)): # the number of samples greater than the maximum number of possible samples
number_samples = int(total_seg_used/float(step)) # set number of samples equal to the maximum number of possible samples
print('User selected number of synthetic samples is greater than the maximum number of possible samples')
print('Set the number of samples equal to the maximum number = {}'.format(number_samples))
#-----------------------------------------------------+
# generate synthetic list of center times of samples |
#-----------------------------------------------------+
whole_list_center_times = np.array([])
for start_seg_used, end_seg_used in SegmentsMat:
length_seg_used = end_seg_used - start_seg_used # length of a segment in sec
number_picked = int(number_samples * length_seg_used / float(total_seg_used)) # number of points to be selected in this segment
# chop this segment by a given time step, then select "number_picked" points out of "length_seg_used/step" points, without duplication
selected_index_center_times = np.random.choice(int(length_seg_used/float(step)), number_picked+1, replace=False) # 1 comes to pick samples a little more than the request at this moment as the sample size might be smaller than than the setting
# calculate a list of center times
list_center_times = np.array([start_seg_used + step * selected_index_center_time for selected_index_center_time in selected_index_center_times])
# sort this list in ascendingly
list_center_times = np.sort(list_center_times)
# append this list of center times to the whole list
whole_list_center_times = np.append(whole_list_center_times, list_center_times)
#whole_list_center_times = whole_list_center_times[:number_samples] # make the sample size equal to the request
# randomly choose the timestamps where the size of the sample equal to the request
whole_list_center_times = np.random.choice(whole_list_center_times, number_samples, replace=False)
# ascending order
whole_list_center_times = np.sort(whole_list_center_times)
#--------------------------------------+
# generate synthetic list of durations |
#--------------------------------------+
duration_min = 0.02 # the minimum duration is set to 0.02 s. This is ok as in this program analysis glitches with duration gerater than 0.01 s
gen_d = 10 ** np.random.uniform(np.log10(duration_min), np.log10(duration_max), number_samples)
# make a pandas dataframe that is compatible with a standard format in my program
df = pd.DataFrame()
df = df.assign(GPStime=whole_list_center_times)
df = df.assign(startGPStime=np.zeros(number_samples))
df = df.assign(peakFreq=np.zeros(number_samples))
df = df.assign(snr=np.zeros(number_samples))
df = df.assign(amplitude=np.zeros(number_samples))
df = df.assign(centralFreq=np.zeros(number_samples))
df = df.assign(duration=gen_d)
df = df.assign(bandwidth=np.zeros(number_samples))
df = df.assign(chisq=np.zeros(number_samples))
df = df.assign(chisqDof=np.zeros(number_samples))
df = df.assign(confidence=np.zeros(number_samples))
df = df.assign(id=[None] * number_samples)
df = df.assign(ifo=['{}'.format(IFO)] * number_samples)
df = df.assign(label=['null'] * number_samples)
df = df.assign(imgUrl=[None] * number_samples)
output_path = os.path.join(trigger_dir, 'RandomSample.csv')
df.to_csv(output_path) # save the data sample
df = pd.read_csv(output_path) # load the file
df.loc[:, 'label'] = 'null' # this is py3 issue, the label of 'null' is read as NaN
return df
[docs]def FindBGlistBurnIn(state, duration_max, number_trials, step, outputMother_dir, df, Epoch_lt, IFO, BGSNR_thre, UserDefinedDuration, gap):
'''
description: From the random time stamps created with FindRadomlistPointsForBurnIn(), select the subset in which their on-source windows do not overlap with any other glitches
Note that if on-source window can be extended, it will do
1. load glitch data set (.csv file)
4. accept time stamps where their on-source windows do not coincide with any other glitches
5. return the info of the accepted glitches
USAGE: Listsegments = FindBGlistBurnIn(state, duration_max, number_trials, step, outputMother_dir, df, Epoch_lt, IFO, BGSNR_thre, UserDefinedDuration, gap)
:param df: GravitySpy meta data in pandas format
:param Epochstart: starting time of an epoch
:param Epochend: end time of an epoch
:param Commissioning_lt: commissioning time in list
:param TargetGlitchClass: a target glitch class name (str)
:param IFO: a type of interferometer (H1, L1, V1) (str)
:param BGSNR_thre: an upper threhold of SNR for background glitches, i.e., quiet enough ), float or int
:param targetSNR_thre: a lower threshold of SNR for target glitches, float or int
:param Confidence_thre: a threshold of confidence level (float or int )
:param UpperDurationThresh: an upper bound of duration in sec (float or int)
:param LowerDurationThresh: a lower bound of duration in sec (float or int)
:param UserDefinedDuration: user defined duration of a glitch (float or int), 0 in default
:param gap: a time gap between the target and the background segments in sec, 1 sec in default
:param flag: 'Both' or 'Either' taking both backgronds or either the preceding or the following background, respectively to accept glitches
:return: the list of parameters of glitches passing the above thresholds
Listsegments contains of
ListIndexSatisfied: a list of index of glitches
Listtarget_timeseries_start: a list of a target glitch starting time
Listtarget_timeseries_end: a list of a target glitch ending time
Listpre_background_start: a list of preceding background starting time
Listpre_background_end; a list of preceding background ending time
Listfol_background_start: a list of following background starting time
Listfol_background_end: a list of following background ending time
Listgpstime: a list of GPS times
Listduration: a list of durations
ListSNR: a list of SNRs
Listconfi: a list of confidence levels
ListID: a list of IDs
'''
df_list = []
# take the data in an Epoch
for Epoch in Epoch_lt:
StartT, EndT = Epoch[0], Epoch[1]
df_epoch = df[(df['GPStime'] >= StartT) & (df['GPStime'] <= EndT)]
df_list.append(df_epoch)
df_int = pd.concat(df_list)
# # data set containing only a target glitch class
# try: # gravity spy and omicron trigger mode
# df_target = df_int[(df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['peakFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['peakFreq'] <= TriggerPeakFreqUpperCutoff) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)]
# except KeyError: # pyCBC mode as there is no column "peakFreq"
# df_target = df_int[(df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)]
# generate synthetic data points
df_random = FindRadomlistPointsForBurnIn(state, IFO, Epoch_lt, number_trials, step, duration_max, outputMother_dir)
df_int = df_int.append(df_random, ignore_index=True) # append the dataset of random points
#### improvement for duration 2019-07-02 #############
#### full durations are too long for capturing the glitch signature so that the solution is to choose T90
### In this situation, a gap that is manually chosen will be gap_new = gap_m + 0.05*T100
### This function is for getting quite segments so that I concervatively choose the longest fraction of duration, in default
FractionEnergy = 1 # (100% of energy )
# note (1 - FractionEnergy)/2. is the fraction of the one-sided off T90
df_int = df_int.assign(gap=gap + (1 - FractionEnergy) / 2. * df_int['duration'])
df_int['duration'] = np.round(FractionEnergy * df_int['duration'], 3) # replace full durations (T100) with T90
# make the time segments of glitches measured
# I reserve duration +/- gap around a GPS time of a glitch
if UserDefinedDuration > 0: # accept a user defined duration
# assign this user defined duration for any types of glieches observed in a given IFO
df_int.loc[(df_int['ifo'] == IFO), ['duration']] = UserDefinedDuration
else: # otherwise, keep the duration that are determined by an event trigger generator (ETG)
pass
df_int = df_int.assign(reserved_time_start=df_int['GPStime'] - df_int['duration'] / 2. - df_int['gap'])
df_int = df_int.assign(reserved_time_end=df_int['GPStime'] + df_int['duration'] / 2. + df_int['gap'])
# data set of rondomly chosen background
df_glitch = df_int[df_int.label == 'null'] # synthetic background trigger has SNR of zero
# exclude glitches with SNR <= BGSNR from df_int as they are quiet enough to be considered as background
df_int = df_int[df_int['snr'] > BGSNR_thre]
print('all the glitches in the class: {}'.format(df_glitch.shape[0]))
# the list of values satisfied as secure
ListIndexSatisfied = []
Listtarget_timeseries_start = []
Listtarget_timeseries_end = []
Listpre_background_start = []
Listpre_background_end = []
Listfol_background_start = []
Listfol_background_end = []
Listgpstime = []
Listduration = []
ListSNR = []
Listconfi = []
ListID = []
for ind in range(len(df_glitch)):
# target glitch's peak GPS time and the duration
gpstime = df_glitch['GPStime'].iloc[ind] # a peak GPS time
duration = df_glitch['duration'].iloc[ind] # # a duration
SNR = df_glitch['snr'].iloc[ind] # snr
confi = df_glitch['confidence'].iloc[ind] # confidence level
ID = df_glitch['id'].iloc[ind] # ID
gap_individual = df_glitch['gap'].iloc[ind]
if duration > 0.02: # accept the duration of glitch is greater than 0.02 sec
pass # go to the next line
else: # if the duration of a glitch is less than 0.02 sec
continue # go to the next iteration
# the starting and ending time of the target glitch
# at this moment I assume that the target glitch is distributed evenly around the GPS time
target_timeseries_start = gpstime - duration / 2.
target_timeseries_end = target_timeseries_start + duration
# take the background
# the background is around the target glitch
# preceding background, which is a nearest neighbor segment before target_timeseries_start
# following background, which is a nearest neighbor segment after target_timeseries_end
background_duration = 64 # duration # duration of both the proceeding and following background
pre_background_start = target_timeseries_start - gap_individual - background_duration
pre_background_end = pre_background_start + background_duration # target_timeseries_start - gap_individual
fol_background_start = target_timeseries_end + gap_individual
fol_background_end = fol_background_start + background_duration # target_timeseries_end + gap_individual + background_duration
# security check of the preceding background
# case 1 is where the background is coincident with the right edge of the reserved segment
# case 2 is where the background is coincident with the left edge of the reserved segment
# case 3 is where the reserved segment is inside of the background
# case 4 is where the background is inside of the reserved segment
# it violates the security of the background estimation if at least one of the cases occurs
# i.e., pro_case1, 2, 3, and 4 should be zero
case1 = len(df_int[(df_int['reserved_time_end'] > target_timeseries_start) & (df_int['reserved_time_end'] < target_timeseries_end)]) # case 1
case2 = len(df_int[(df_int['reserved_time_start'] > target_timeseries_start) & (df_int['reserved_time_start'] < target_timeseries_end)]) # case 2
case3 = len(df_int[(df_int['reserved_time_start'] > target_timeseries_start) & (df_int['reserved_time_end'] < target_timeseries_end)]) # case 3
case4 = len(df_int[(df_int['reserved_time_start'] < target_timeseries_start) & (df_int['reserved_time_end'] > target_timeseries_end)]) # case 4
if case1 == 0 and case2 == 0 and case3 == 0 and case4 == 0:
try:
# smallest time gap between the start of a target reserved time and the nearest reserved time of others that are before the target
neareset_gap_pre = (target_timeseries_start - df_int[df_int['reserved_time_end'] < target_timeseries_start]['reserved_time_end']).sort_values()[:1].values[0]
if neareset_gap_pre > background_duration - 5: # # if the smallest gap is larger than the background duration
neareset_gap_pre = background_duration - 5 # replace it
else:
pass
target_timeseries_start = target_timeseries_start - neareset_gap_pre # replace
except IndexError: # there is no nearest glitch that satisfies the above condition
pass
try:
# smallest time gap between the end of a target reserved time and the nearest reserved time of others that are after the target
neareset_gap_fol = (df_int[df_int['reserved_time_start'] > target_timeseries_end]['reserved_time_start'] - target_timeseries_end).sort_values()[:1].values[0]
if neareset_gap_fol > background_duration - 5: # # if the smallest gap is larger than the background duration
neareset_gap_fol = background_duration -5 # replace it
else:
pass
target_timeseries_end = target_timeseries_end + neareset_gap_fol # replace
except IndexError: # there is no nearest glitch that satisfies the above condition
pass
# update duration
duration = round(target_timeseries_end - target_timeseries_start, 3)
# #print('{:.1f}, {:.1f}, {:.1f}'.format(neareset_gap_pre, neareset_gap_fol, duration))
print('{:.1f}, {:.1f}, {:.1f}'.format(pre_background_end - pre_background_start, fol_background_end - fol_background_start, duration))
ListIndexSatisfied.append(ind) # append the index satisfied as secure
Listtarget_timeseries_start.append(target_timeseries_start)
Listtarget_timeseries_end.append(target_timeseries_end)
Listpre_background_start.append(pre_background_start)
Listpre_background_end.append(pre_background_end)
Listfol_background_start.append(fol_background_start)
Listfol_background_end.append(fol_background_end)
Listgpstime.append(gpstime)
Listduration.append(duration)
ListSNR.append(SNR)
Listconfi.append(confi)
ListID.append(ID)
else:
continue
# zipped list of segments
Listsegments = list(
zip(ListIndexSatisfied, Listtarget_timeseries_start, Listtarget_timeseries_end, Listpre_background_start,
Listpre_background_end, Listfol_background_start, Listfol_background_end, Listgpstime, Listduration,
ListSNR, Listconfi, ListID))
return Listsegments
# ==== save ====
[docs]def clean_duration_for_asd(duration):
'''
description:
This function is used for clean the dicimal points in value of duration to avoid the error shown in ASD estimator in gwpy
:param duration: duration in sec
:return: duration: cleaned duration in sec
'''
fourth_decimal_num_of_duration = (duration * 1000 - floor(duration * 1000)) * 10 # fourth decimal number
duration = duration - fourth_decimal_num_of_duration * 0.0001 # truncate at the fourth decimal number
# revise the duration to make an error in whitening process not not to occur
# ValueError occurs in the whitening when the lengths of whitened data, whitening data and the filter are different
third_decimal_num_of_duration = round((round(duration, 3) - round(duration, 2)) * 1000)
if third_decimal_num_of_duration % 2 == 1: # odd number at the third decimal point
duration = duration - 0.001 # make the duration with an even number at the third decimal point
elif third_decimal_num_of_duration % 2 == 0: # even number at the third decimal point
pass # keep the original duration as it is
duration = round(duration, 3) # round it up at 3rd decimal point as the above three lines creates small number below the 3rd decimal point
return duration
[docs]def Multiprocess_whitening_for_burn_in(full_timeseries, target_timeseries_start, target_timeseries_end, array_dummy_duration):
'''
description:
This is used for normalizing the spectrum of each trial in the on-source window of random time stamps created with FindBGlistBurnIn()
iterate each trial for each dummy on
1. from the time series of a single channel
2. calculate how many trials are available per dummy on-source window within the extended on-source window
3. iterate trials per dummy on-source window
USAGE: list_whitened_fft, sample_rate, DURATION, list_num_trial_used = Multiprocess_whitening_for_burn_in(full_timeseries, target_timeseries_start, target_timeseries_end, array_dummy_duration)
:param full_timeseries: time series comprising target and BGs
:param target_timeseries_start: a start time of a target segment
:param target_timeseries_end: an end time of a target segment
:param array_dummy_duration: numpy array of dummy on-source windows
:return: list_whitened_fft: list of numpy arrays of the nomalized spectrum where each element of this list is the normalized spectrum for each trial with a given dummy on-source window.
These spectrums are concatenated to a vector from left to right, e.g, np.array([sp0_try0, sp1_try0, ..., sp0_try1, sp0_try1, .... ])
Hence, this list is [ (sp for dummy 0), (sp for dummy 1), (....), ....]
sample_rate: sampling rate of this channel
DURATION: a duration of a target segment
list_num_trial_used: a list of trials per dummy on-source window. Note the number of trials per dummy on-source vary becuase of the limited length of the extended total on-source window.
The longer the dummy on-source, the fewer the trials are available
'''
sample_rate = full_timeseries.sample_rate.value
DURATION = target_timeseries_end - target_timeseries_start
fourth_decimal_num_of_duration = (DURATION * 1000 - floor(DURATION * 1000)) * 10 # fourth decimal number
DURATION = DURATION - fourth_decimal_num_of_duration * 0.0001 # truncate at the fourth decimal number
# revise the duration to make an error in whitening process not not to occur
# ValueError occurs in the whitening when the lengths of whitened data, whitening data and the filter are different
third_decimal_num_of_duration = round((round(DURATION, 3) - round(DURATION, 2)) * 1000)
if third_decimal_num_of_duration % 2 == 1: # odd number at the third decimal point
DURATION = DURATION - 0.001 # make the duration with an even number at the third decimal point
elif third_decimal_num_of_duration % 2 == 0: # even number at the third decimal point
pass # keep the original duration as it is
DURATION = round(DURATION, 3) # round it up at 3rd decimal point as the above three lines creates small number below the 3rd decimal point
# get the on-source time series
on_source = full_timeseries.crop(start=target_timeseries_start, end=target_timeseries_end)
on_source_start = on_source.span[0]
on_source_end = on_source.span[1]
full_timeseries_start = full_timeseries.span[0]
full_timeseries_end = full_timeseries.span[1]
# average background ASD
length_for_ASD = 128 # in sec
timeseries_for_asd = full_timeseries.crop(end=full_timeseries_start+length_for_ASD)
list_whitened_fft = []
list_num_trial_used = []
for dummy_duration in array_dummy_duration: # iterate dummy on-source window
dummy_duration = clean_duration_for_asd(dummy_duration)
# number of BG trials
iterate = int(DURATION / dummy_duration)
## --------cropping from the starting time------
# take some fraction of trials
list_index_trials = select_some_trials(iterate, maximum_iterate=10)
crop_start_list = [on_source_start + i * dummy_duration for i in list_index_trials] # a list of starting times cropped segments
crop_end_list = [crop_start + dummy_duration for crop_start in crop_start_list] # a list of ending times of cropped segments
# avoid the cropping warning
crop_end_list = [crop_end if crop_end < on_source_end else on_source_end for crop_end in crop_end_list] # if end time is replaced with the end of the BG segments if it is larger
# note: fft_PBG = pre_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs().value # crop and fft of it
ave_background_asd = timeseries_for_asd.asd(dummy_duration, 0.5 * dummy_duration, method='median').value
if (timeseries_for_asd.value == 0).all() is True: # if the time series is all zero
whitened_fft_ind_list = [np.zeros(ave_background_asd.shape[0]) for crop_start, crop_end in zip(crop_start_list, crop_end_list)]
elif ave_background_asd[ave_background_asd != 0].shape[0] == 0: # if the time series is all zero
whitened_fft_ind_list = [np.zeros(ave_background_asd.shape[0]) for crop_start, crop_end in zip(crop_start_list, crop_end_list)]
else:
# replace value of elements equal to zero with non-zero nearest neighbor
ave_background_asd_new = ave_background_asd.copy()
zero_indecies = np.where(ave_background_asd == 0)[0]
nonzero_indecies = np.where(ave_background_asd != 0)[0]
if zero_indecies.shape[0] == 0: # if everything is non-zero
pass
# replace value of elements equal to zero with non-zero nearest neighbor
else: # if some of them are zero
for zero_ind in zero_indecies: #
# nearest index with non-zero value
k = np.argsort(np.abs((zero_ind - nonzero_indecies)))[0]
ave_background_asd_new[zero_ind] = ave_background_asd[nonzero_indecies[k]]
# calculate each whitened FFT
whitened_fft_ind_list = [on_source.crop(start=crop_start, end=crop_end).asd(dummy_duration, 0.5 * dummy_duration, method='median').value / ave_background_asd_new for crop_start, crop_end in zip(crop_start_list, crop_end_list)]
del crop_start_list, crop_end_list # delete redundant lists
whitened_fft = np.concatenate(whitened_fft_ind_list, axis=None)
del whitened_fft_ind_list # delete redundant a list
# fill
list_whitened_fft.append(whitened_fft)
# number of trials used
num_trial_used = len(list_index_trials)
# fill
list_num_trial_used.append(num_trial_used)
return list_whitened_fft, sample_rate, DURATION, list_num_trial_used
[docs]def SaveDummyTargetHDF5_OFFLINE_burn_in(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, PlusHOFT='False', trial_duration_sample=20):
'''
description:
Save the normalized spectrum for every channels for every glitches, with the use of Multiprocess_whitening_for_burn_in()
1. iterate all the samples
2. whiten the on-source window for every channels
4. save the whitened spectrum as a HDF5 file
each group in the HDF5 is for a single channel and each of datasets per group has the normalized spectrum for a dummy on-source window
USAGE: SaveTargetAndBackGroundHDF5_OFFLINE_burn_in(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, PlusHOFT='False')
:param Listsegments: a list of segment parameters
:param re_sfchs: a list of safe channels
:param outputpath: a directory of an output file
:param outputfilename: a name of an output file
:param number_process: a number of processes in parallel
:param PlusHOFT: whether to get data of hoft, {'True' or 'False'}, 'False' in default
:param trial_duration_sample: a number of dummy on-source
:return None
'''
# check if the output directory exists
if os.path.isdir(outputpath) is False:
print("The directory for the output HDF5 file does not exist: path = {}".format(outputpath))
print("Please specify the existing directory for it.")
sys.exit()
# the list of the safe channels
channels = ['{0}:{1}'.format(IFO, ch) for ch in re_sfchs]
# the path and the file name
outputPathFileName = os.path.join(outputpath, outputfilename)
# make an output file
try:
f = h5py.File(outputPathFileName, 'w-')
f.close()
except IOError:
print('The file already exists in {}'.format(outputPathFileName))
print('Please choose a different file name in a configuration file or delete this file if you want to overwrite it')
sys.exit()
# remove the astro cashe
if os.path.isfile('~/.astro/cashe/downlowd/py3/lock') is True:
os.remove('~/.astro/cashe/downlowd/py3/lock')
# open a HDF5 file in writing mode
with h5py.File(outputPathFileName, 'r+') as f:
for IndexSatisfied, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end, gpstime, duration, SNR, confi, ID in Listsegments:
t0 = time.time()
try: # deal with time series in bad state
if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0: # if there is an available proceeding background and if there is an available following background
full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0: # if there is an available proceeding background and if there is NO available following background
full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, target_timeseries_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0: # if there is NO available proceeding background and if there is an available following background
full_timeseriesDict = TimeSeriesDict.find(channels, target_timeseries_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
else:
pass
channels_used = channels
if PlusHOFT == 'True': # get the data of hoft
if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0: # if there is an available proceeding background and if there is an available following background
full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], pre_background_start, fol_background_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0: # if there is an available proceeding background and if there is NO available following background
full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], pre_background_start, target_timeseries_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0: # if there is NO available proceeding background and if there is an available following background
full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], target_timeseries_start, fol_background_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
else:
pass
full_timeseriesDict.append(full_timeseriesDict_hoft) # append
del full_timeseriesDict_hoft # delete the hoft object
channels_used = np.append(channels, '{}:GDS-CALIB_STRAIN'.format(IFO))
# =====================================================
except IndexError:
print('Bad data, go to a next GPS')
continue # go to a next GPS time
except RuntimeError:
print('RuntimeError, go to a next GPS')
continue
except IOError: # error when getting time series
print('IOError, go to a next GPS')
continue
except ValueError: # ValueError: Failed to read /hdfs/frames/O3/raw/H1/H-H1_R-12384/H-H1_R-1238489600-64.gwf: Unable to map from the stream class id (0) to internal class id at position: 18446744073709551601 [streamref: class: 0 instance: 0 length: 0 ]
print('ValueError, go to a next GPS')
continue
g = f.create_group('gps{:05d}'.format(IndexSatisfied))
g.attrs['GPS'] = gpstime # a peak GPS time
g.attrs['duration'] = duration
g.attrs['snr'] = SNR # snr
g.attrs['confidence'] = confi # confidence
g.attrs['id'] = ID
g.attrs['ifo'] = channels[0].split(':')[0]
# total on-source window
on_source_duration = target_timeseries_end - target_timeseries_start
# duration on trial on-source window
#------------------------------------------------------------+
# randomly choose dummy on-source windows uniform in log10 |
#------------------------------------------------------------+
duration_min = 0.02 # the minimum duration is set to 0.02 s. This is ok as in this program analysis glitches with duration gerater than 0.02 s
duration_max = 15
if duration_max > on_source_duration:
duration_max = on_source_duration
array_dummy_duration = np.round(10 ** np.random.uniform(np.log10(duration_min), np.log10(duration_max), trial_duration_sample), 3)
try:
ch_labels = range(len(channels_used)) # a list of channel labels
list_array_dummy_duration = []
List_list_whitened_fft = [] # a list of whitened fft of target
List_sample_rate = [] # a list of sample rates
List_DURATION = []
list_list_num_trial_used = []
# whitened target and BG segments with multi processes
with concurrent.futures.ProcessPoolExecutor(max_workers=number_process) as executor:
for list_whitened_fft, sample_rate, DURATION, list_num_trial_used in executor.map(Multiprocess_whitening_for_burn_in,
full_timeseriesDict.values(),
itrtls.repeat(target_timeseries_start),
itrtls.repeat(target_timeseries_end),
itrtls.repeat(array_dummy_duration)
):
List_list_whitened_fft.append(list_whitened_fft) # fill
List_sample_rate.append(sample_rate) # fill
List_DURATION.append(DURATION) # fill
list_list_num_trial_used.append(list_num_trial_used) # fill
for ch_label in ch_labels: # iterate through channel labels
sub_g = g.create_group("ch{:05d}".format(ch_label))
sub_g.attrs['SamplingRate'] = List_sample_rate[ch_label] # target
sub_g.attrs['channel'] = channels_used[ch_label].split(':')[1] # target
for dummy_duration_index in range(len(array_dummy_duration)):
dset = sub_g.create_dataset("dummy{:05d}".format(dummy_duration_index), data=List_list_whitened_fft[ch_label][dummy_duration_index], compression="gzip", compression_opts=9) # target
dset.attrs['SamplingRate'] = List_sample_rate[ch_label] # target
dset.attrs['duration'] = array_dummy_duration[dummy_duration_index] # target
dset.attrs['num_trial'] = list_list_num_trial_used[ch_label][dummy_duration_index] # target
except ValueError:
del f['gps{:05d}'.format(IndexSatisfied)]
print('*** delete group gps{:05d} due to ASD estimation error ***'.format(IndexSatisfied))
continue # go to a next GPS time
except IOError: # IOError: Driver write request failed
del f['gps{:05d}'.format(IndexSatisfied)]
print('*** delete group gps{:05d} due to memory error ***'.format(IndexSatisfied))
continue # go to a next GPS time
print('Complete saving a sample as gps{:05d}'.format(IndexSatisfied))
t1 = time.time()
print('Time taken: {} s'.format(t1 - t0))
# ====== save for multi-frequency bands search =====
[docs]def BG_upper_threshold_single_channel_given_freqband(list_dummy_duration, list_whitened_fft, sampling_rate, list_num_trial_used, sigma, LowerCutOffFreq='None', UpperCutOffFreq='None'):
'''
description:
For a single channel per glitch, this function calculates a list of the background upper threshold across dummy on-source windows
USAGE: list_bg_upper_threshold = BG_upper_threshold_single_channel_given_freqband(list_dummy_duration, list_whitened_fft, sampling_rate, list_num_trial_used, sigma, LowerCutOffFreq='None', UpperCutOffFreq='None')
:param list_dummy_duration: a list of dummy on-source window
:param list_whitened_fft: a list of the normalized spectrums where each element is a spectrum for a given dummy on-source window
:param sampling_rate: sampling rate of a channel
:param list_num_trial_used: a list of trials of dummy on-source window within the total on-source window
:param sigma: an integer to determine the upper bound of the off-source window
:param LowerCutOffFreq: a lower cutoff frequency
:param UpperCutOffFreq: an upper cutoff frequency
:return: Count: importance
'''
list_bg_upper_threshold = np.array([])
for dummy_duration_index, dummy_duration in enumerate(list_dummy_duration, 0):
sp = list_whitened_fft[dummy_duration_index]
num_trial_used = list_num_trial_used[dummy_duration_index]
try: # test if it is an array
sp[0]
except ValueError: # if it is None
sp = 0 # assign an integer
#=======================
if type(sp) != int:
sp = sp.reshape(int(num_trial_used), int(len(sp)/num_trial_used)) # reshape it
else:
sp = np.array([[0]]) # a trick to be used in the line, if np.isnan(sp[0]) != True :
#=======================
#++++++ for the case where there is no data measured at a channel +++++++++++++++++++
#++++++ That channel has no contribution to h(t) so that we can define Count = 0 +++++++++++
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
if np.isnan(sp[:,0]).any() != True : # if sp has data measured
# low, high, or band pass filter if we know which frequency band is important
if LowerCutOffFreq != 'None' and UpperCutOffFreq == 'None': # high-pass filter
## pT, pP, and pF should be zero if LowerCutOffFreq is greater than the half of the sampling rate as there is no data. However pT would be non-zero due to the bad resolution (a very short giltch)
if sampling_rate / 2 <= LowerCutOffFreq:
sp = np.array([[0]])
else:
LowerCutOffElement = int(LowerCutOffFreq * dummy_duration)
if LowerCutOffElement <= sp.shape[-1]: # if LowCutOffElement is within the length of the vector
sp = sp[:, LowerCutOffElement:]
else: # if LowerCutOffElement is greater than the length of the vector
sp = np.array([[0]])
elif LowerCutOffFreq == 'None' and UpperCutOffFreq != 'None': # low-pass filter
# even if UpperCutOffElement is greater than the length of the vector is works fine
UpperCutOffElement = int(UpperCutOffFreq * dummy_duration)
sp = sp[:, :UpperCutOffElement]
elif LowerCutOffFreq != 'None' and UpperCutOffFreq != 'None': # band pass filter
## pT, pP, and pF should be zero if LowerCutOffFreq is greater than the half of the sampling rate as there is no data. However pT would be non-zero due to the bad resolution (a very short giltch)
if sampling_rate / 2 <= LowerCutOffFreq:
sp = np.array([[0]])
else:
LowerCutOffElement = int(LowerCutOffFreq * dummy_duration)
UpperCutOffElement = int(UpperCutOffFreq * dummy_duration)
if LowerCutOffElement <= sp.shape[-1]: # if LowCutOffElement is within the length of the vector
sp = sp[:, LowerCutOffElement:UpperCutOffElement]
else: # if LowCutOffElement is greater than the length of the vector
sp = np.array([[0]])
else:
pass
if sp[0,0] == 0:
GausCeiling = 0
else:
GausCeiling = sp.mean() + sigma * sp.std(ddof=1) # gaussian noise ceiling
else:
GausCeiling = 0 # manually put zero because this channel has no contribution at this glitch ID
list_bg_upper_threshold = np.append(list_bg_upper_threshold, GausCeiling)
return list_bg_upper_threshold
[docs]def BG_upper_threshold_single_channel_multiband(list_dummy_duration, list_whitened_fft, sampling_rate, list_num_trial_used, sigma):
'''
description:
calculate values of the background upper threshold per dummy on-source window and per frequency band
USAGE: MatBGUpperThresh = BG_upper_threshold_single_channel_multiband(list_dummy_duration, list_whitened_fft, sampling_rate, list_num_trial_used, sigma)
:param list_dummy_duration: a list of dummy on-source window
:param list_whitened_fft: a list of the normalized spectrums where each element is a spectrum for a given dummy on-source window
:param sampling_rate: sampling rate of a channel
:param list_num_trial_used: a list of trials of dummy on-source window within the total on-source window
:param sigma: an integer to determine the upper bound of the off-source window
:return: MatBGUpperThresh: values of the background upper threshold, numpy array, where the frequency bands are rows from the top to bottom, the dummy on-source windows are in columns from left to right
'''
# iterate different frequency bands in the list
MatBGUpperThresh = np.array([])
for lower_freq_cut, upper_freq_cut in Const.freq_bands:
list_bg_upper_threshold = BG_upper_threshold_single_channel_given_freqband(list_dummy_duration, list_whitened_fft, sampling_rate,
list_num_trial_used, sigma, LowerCutOffFreq=lower_freq_cut, UpperCutOffFreq=upper_freq_cut)
list_bg_upper_threshold = list_bg_upper_threshold.reshape(1, list_bg_upper_threshold.shape[-1])
MatBGUpperThresh = np.vstack([MatBGUpperThresh, list_bg_upper_threshold]) if MatBGUpperThresh.size else list_bg_upper_threshold
return MatBGUpperThresh
[docs]def CreateBGUpperThresh_single_channel_multiband(full_timeseries, target_timeseries_start, target_timeseries_end, array_dummy_duration, sigma):
'''
discription:
1. make list_whitened_fft: list of numpy arrays of the nomalized spectrum where each element of this list is the normalized spectrum for each trial with a given dummy on-source window.
These spectrums are concatenated to a vector from left to right, e.g, np.array([sp0_try0, sp1_try0, ..., sp0_try1, sp0_try1, .... ])
Hence, this list is [ (sp for dummy 0), (sp for dummy 1), (....), ....]
sample_rate: sampling rate of this channel
DURATION: a duration of a target segment
list_num_trial_used: a list of trials per dummy on-source window. Note the number of trials per dummy on-source vary becuase of the limited length of the extended total on-source window.
The longer the dummy on-source, the fewer the trials are available
2. calculate values of the background upper threshold per dummy on-source window and per frequency band
USAGE: MatBGUpperThresh, sample_rate = CreateBGUpperThresh_single_channel_multiband(full_timeseries, target_timeseries_start, target_timeseries_end, array_dummy_duration, sigma)
:param full_timeseries: the full time series in gwpy object including on- and off source windows
:param target_timeseries_start: the start time of the on-source window
:param target_timeseries_end: the end time of the on-source window
:param array_dummy_duration: numpy array of dummy on-source windows
:param sigma: an interger to calculate the value of importance
:return:
MatBGUpperThresh: values of the background upper threshold, numpy array, where the frequency bands are rows from the top to bottom, the dummy on-source windows are in columns from left to right
sample_rate: a sampling rate of a single channel
'''
# calculate the whitened FFT of the on- and off-source windows
list_whitened_fft, sample_rate, DURATION, list_num_trial_used = Multiprocess_whitening_for_burn_in(full_timeseries, target_timeseries_start, target_timeseries_end, array_dummy_duration)
# calculate the importance
MatBGUpperThresh = BG_upper_threshold_single_channel_multiband(array_dummy_duration, list_whitened_fft, sample_rate, list_num_trial_used, sigma)
return MatBGUpperThresh, sample_rate
[docs]def CreateAllChannels_BGUpperThresh_multband(Listsegment, IFO, re_sfchs, number_process, PlusHOFT, sigma, duration_max, trial_duration_sample):
'''
description:
1. use a single glitch time
2. query timeseries of all the channels around a glitch
3. calculate values of the background upper threshold per dummy on-source window per frequency band
4. iterate through channels
USAGE: IndexSatisfied, list_mat_BG_upper_thresh, array_dummy_duration, list_sample_rates, re_sfchs, gpstime, duration, SNR, confi, ID = CreateAllChannels_BGUpperThresh_multband(Listsegment, IFO, re_sfchs, number_process, PlusHOFT, sigma, duration_max = 15, trial_duration_sample = 20)
:param Listsegment: a list of segment parameters
:param IFO
:param re_sfchs: a list of safe channels
:param number_process: a number of processes in parallel
:param PlusHOFT: whether to get data of hoft, {'True' or 'False'}
:param sigma: an integer to be used for calculating values of importance
:param maximum value of dummy on-sourc window in sec
:param a number of dummy on-source windows within the total on-source window
:return:
IndexSatisfied: glitch index
list_mat_BG_upper_thresh: a list of matrices per channel where element of a matrix is a value of the background upper threshold, numpy array, where the frequency bands are rows from the top to bottom, the dummy on-source windows are in columns from left to right
array_dummy_duration: numpy array of the dummy on-source windows
list_sample_rates: a list of sampling rates of channels, numpy array
re_sfchs: list of channels without "IFO:" at the beginning
gpstime: a gps time
duration: a value of duration
SNR: signal to noise ratio in the h(t)
conf: a confidence level of classification of a glitch, provided by Gravity Spy. Otherwise None
ID: a glitch ID, provided by Gravity Spy in usual
'''
channels = ['{0}:{1}'.format(IFO, ch) for ch in re_sfchs]
IndexSatisfied, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end, gpstime, duration, SNR, confi, ID = Listsegment
t0 = time.time()
# extracting the target data and the proceeding and following backgrounds
try: # deal with time series in bad state
if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0: # if there is an available proceeding background and if there is an available following background
full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0: # if there is an available proceeding background and if there is NO available following background
full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, target_timeseries_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0: # if there is NO available proceeding background and if there is an available following background
full_timeseriesDict = TimeSeriesDict.find(channels, target_timeseries_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
else:
pass
if PlusHOFT == 'True': # get the data of hoft
if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0: # if there is an available proceeding background and if there is an available following background
full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], pre_background_start, fol_background_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0: # if there is an available proceeding background and if there is NO available following background
full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], pre_background_start, target_timeseries_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0: # if there is NO available proceeding background and if there is an available following background
full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], target_timeseries_start, fol_background_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
else:
pass
full_timeseriesDict.append(full_timeseriesDict_hoft) # append
del full_timeseriesDict_hoft # delete the hoft object
re_sfchs = np.append(re_sfchs, 'GDS-CALIB_STRAIN')
except IndexError as err:
print("IndexError: {}".format(err.message))
# IndexError: After reading object of type: FrAdcData stream is no longer in good state. ( Frmae: 0 ChannelType: L1:ASC-X_TR_A_NSUM_OUT_DQ offset: 932092099)
return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
except RuntimeError as err:
print("RuntimeError: {}".format(err.message))
return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
except IOError as err: # error when getting time series
print('IOError: {}'.format(err.message))
return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
except ValueError: # ValueError: Failed to read /hdfs/frames/O3/raw/H1/H-H1_R-12384/H-H1_R-1238489600-64.gwf: Unable to map from the stream class id (0) to internal class id at position: 18446744073709551601 [streamref: class: 0 instance: 0 length: 0 ]
print('ValueError')
return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
try:
# total on-source window
on_source_duration = target_timeseries_end - target_timeseries_start
# duration on trial on-source window
# -------------------------------------------+
# randomly choose dummy on-source windows |
# -------------------------------------------+
duration_min = 0.02 # the minimum duration is set to 0.02 s. This is ok as in this program analysis glitches with duration gerater than 0.02 s
if duration_max > on_source_duration:
duration_max = on_source_duration
array_dummy_duration = np.round(10 ** np.random.uniform(np.log10(duration_min), np.log10(duration_max), trial_duration_sample), 3)
list_mat_BG_upper_thresh = [] # a list of rho
list_sample_rates = np.array([])
# whitened target and BG segments with multi processes
with concurrent.futures.ProcessPoolExecutor(max_workers=number_process) as executor:
for MatBGUpperThresh, sample_rate in executor.map(CreateBGUpperThresh_single_channel_multiband,
full_timeseriesDict.values(),
itrtls.repeat(target_timeseries_start),
itrtls.repeat(target_timeseries_end),
itrtls.repeat(array_dummy_duration),
itrtls.repeat(sigma)
):
# fill
list_mat_BG_upper_thresh.append(MatBGUpperThresh)
list_sample_rates = np.append(list_sample_rates, sample_rate)
except ValueError:
print('*** delete group gps{:05d} due to ASD estimation error ***'.format(IndexSatisfied))
return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
except IOError: # IOError: Driver write request failed
print('*** delete group gps{:05d} due to memory error ***'.format(IndexSatisfied))
return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
t1 = time.time()
print('Time taken: {} s'.format(t1 - t0))
return IndexSatisfied, list_mat_BG_upper_thresh, array_dummy_duration, list_sample_rates, re_sfchs, gpstime, duration, SNR, confi, ID # pass the parameters
[docs]def SaveUppperThreshodBG_multiband_OFFLINE_burn_in(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, sigma, PlusHOFT='False', duration_max=15, trial_duration_sample=20):
'''
description:
THIS IS USED FOR "OFFLINE" MODE
1. iterate glitch samples
2. get values of the background upper threshold per dummy on-source window per frequency band for each of channels
4. save the values
Note this depends on
USAGE: SaveUppperThreshodBG_multiband_OFFLINE(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, sigma, PlusHOFT='False', duration_max = 15, trial_duration_sample = 20)
:param Listsegments: a list of segment parameters
:param re_sfchs: a list of safe channels
:param outputpath: a directory of an output file
:param outputfilename: a name of an output file
:param number_process: a number of processes in parallel
:param PlusHOFT: whether to get data of hoft, {'True' or 'False'}, 'False' in default
:param sigma: an integer to determine the upper bound of the off-source window
:param duration_max: a maximum value of length of dummy on-source window in sec
:param trial_duration_sample: a number of dummy on-source window within the total on-source window
:return None
'''
# the path and the file name
outputPathFileName = os.path.join(outputpath, outputfilename)
# make an output file
try:
f = h5py.File(outputPathFileName, 'w-')
f.close()
except IOError:
print('The file already exists in {}'.format(outputPathFileName))
print('Please choose a different file name in the configuration file or delete this file if you want to overwrite it')
sys.exit()
# remove the astro cashe
if os.path.isfile('~/.astro/cashe/downlowd/py3/lock') is True:
os.remove('~/.astro/cashe/downlowd/py3/lock')
# open a HDF5 file in writing mode
with h5py.File(outputPathFileName, 'r+') as f:
for Listsegment in Listsegments:
IndexSatisfied, list_mat_BG_upper_thresh, array_dummy_duration, list_sample_rates, re_sfchs_used, gpstime, duration, SNR, confi, ID = CreateAllChannels_BGUpperThresh_multband(Listsegment, IFO, re_sfchs, number_process, PlusHOFT, sigma, duration_max, trial_duration_sample)
if type(duration) == int: # if some error occurs, List_rho is 0, go to the next iteration
continue
else:
g = f.create_group('gps{:05d}'.format(IndexSatisfied))
g.attrs['GPS'] = gpstime # a peak GPS time
g.attrs['duration'] = duration
g.attrs['snr'] = SNR # snr
g.attrs['confidence'] = confi # confidence
g.attrs['id'] = ID
g.attrs['ifo'] = IFO
g.create_dataset("dummy_durations", data=array_dummy_duration, compression="gzip", compression_opts=9) # matrix of frequency bnads
freq_bands = np.array(Const.freq_bands).tolist()
for i in range(len(freq_bands)):
for j in range(len(freq_bands[0])):
freq_bands[i][j] = freq_bands[i][j].encode('utf8')
g.create_dataset("freq_bands", data=freq_bands, compression="gzip", compression_opts=9) # matrix of frequency bnads
for ch_label in range(len(re_sfchs_used)): # iterate through channel labels
dset = g.create_dataset("ch{:05d}".format(ch_label), data=list_mat_BG_upper_thresh[ch_label], compression="gzip", compression_opts=9)
dset.attrs['SamplingRate'] = list_sample_rates[ch_label] # target
dset.attrs['channel'] = re_sfchs_used[ch_label].encode('utf8') # target
#--------------------------
# post-process for burn-in
#--------------------------
[docs]def make_bg_up_thesh_interpolate_hdf5(input_dir, input_hdf5_file):
'''
description:
make a dictionry of values of the background upper threshold across dummy on-source window and frequency bands per channel
USAGE: list_all_dummy_duration_sorted, mat_ch_sorted_dict = make_bg_up_thesh_interpolate_hdf5(input_dir, input_hdf5_file)
:param input_dir: input directory
:param input_hdf5_file: name of a HDF5 file
:return: list_all_dummy_duration_sorted: ascending list of dummy on-source windows
mat_ch_sorted_dict: a dictionary in which each of the key contains array of values the he background upper threshold per dummy on-source window per frequency band
where frequency bands are in row from top to bottom and dummy on-source windows are in columns from left to right
'''
# path to a HDF5 file which contains all the values of the background upper threshold across channels and frequency bands
path_input_hdf5 = os.path.join(input_dir, input_hdf5_file)
if os.path.isfile(path_input_hdf5) is False:
print("No input background upper threshold h5py file called {}".format(path_input_hdf5))
print("Exit this program")
sys.exit()
# open the input hdf5 file with reading mode
f = h5py.File(path_input_hdf5, 'r')
# list of glitches
list_g = list(f.keys())
# --------------------------+
# get the list of channels |
# --------------------------+
list_channels = np.array([])
g = f[list_g[0]]
for dset_ind in g.keys():
if dset_ind[:2] != "ch":
continue
channel = g[dset_ind].attrs["channel"].decode('utf8')
list_channels = np.append(list_channels, channel)
#-------------------------+
# get the frequency bands |
#-------------------------+
list_freq_bands = g['freq_bands'][()]
#--------------------------------------------------------------------------------------------------------------+
# get the background upper thresholds for all the glitches and all the channels across all the frequency bands |
#--------------------------------------------------------------------------------------------------------------+
mat_ch_dict = {} # keys are channels
list_all_dummy_duration = np.array([]) # list of all the dummy durations
# several dummy durations are chosen per glitch
for gps_ind in f.keys(): # iterate through glitches
g = f[gps_ind]
list_dummy_duration = g['dummy_durations'][()]
list_all_dummy_duration = np.append(list_all_dummy_duration, list_dummy_duration)
for ch_ind in range(len(list_channels)): # iterate through channels
# extract the matrix of the back ground upper threshold per channel per glitch
# each element in this matrix is the BG upper threshold value for a given frequency band and a given dummy duration
# frequency bands are listed in row from 0 to 7, for example
# dummy durations are in columns
bg_up_mat = g["ch{:05d}".format(ch_ind)][()]
if gps_ind == list_g[0]: # if the iteration is the first glitch
mat_ch_dict[list_channels[ch_ind]] = bg_up_mat # fill the matrix to the dictionary
else: # otherwise
# stack to the previous matrix
mat_ch_dict[list_channels[ch_ind]] = np.hstack([mat_ch_dict[list_channels[ch_ind]], bg_up_mat]) if mat_ch_dict[list_channels[ch_ind]].size else bg_up_mat
#-----------------------------------------------------------------------+
# short the upper threshold ascendingly in the order of dummy durations |
#-----------------------------------------------------------------------+
sorted_index = np.argsort(list_all_dummy_duration) # sort index ascending
list_all_dummy_duration_sorted = list_all_dummy_duration[sorted_index] # sorted dummy durations
mat_ch_sorted_dict = {}
for channel in mat_ch_dict.keys(): # iterate through channels
mat_ch_sorted_dict[channel] = mat_ch_dict[channel][:, sorted_index] # sort the values and fill
print("Finished extracting values of the background upper threshold")
return list_all_dummy_duration_sorted, mat_ch_sorted_dict
[docs]def save_interpolate_bg_upper_thres_hdf5(list_all_dummy_duration_sorted, mat_ch_sorted_dict, output_dir, output_hdf5_file, med_abs_sigma=6, poly_degree=10):
'''
description:
1. fit the polynomial function against the background upper threshold as a function of dummy on-source window per freq band per channel
2. save the polynomial parametes to a HDF5 file
USAGE: save_interpolate_bg_upper_thres_hdf5(list_all_dummy_duration_sorted, mat_ch_sorted_dict, output_dir, output_hdf5_file, med_abs_sigma=6, poly_degree=10)
:param list_all_dummy_duration_sorted: ascending list of dummy on-source windows
:param mat_ch_sorted_dict: a dictionary in which each of the key contains array of values the he background upper threshold per dummy on-source window per frequency band
where frequency bands are in row from top to bottom and dummy on-source windows are in columns from left to right
:param output_dir: output directory
:param output_hdf5_file: otuput file name
:param med_abs_sigma: a interger number of median absolute error to remove outliers for fitting
:param poly_degree: polynomial degree
:return: None
'''
path_output_hdf5 = os.path.join(output_dir, output_hdf5_file)
try:
f = h5py.File(path_output_hdf5, 'w-')
f.close()
except IOError:
print('The file already exists in {}'.format(path_output_hdf5))
os.system('rm {}'.format(path_output_hdf5))
# open a HDF5 file in writing mode
with h5py.File(path_output_hdf5, 'a') as f:
for channel in mat_ch_sorted_dict.keys(): # iterate through channels
print(channel)
g = f.create_group(channel)
bg_up_mat = mat_ch_sorted_dict[channel]
for band_ind in range(bg_up_mat.shape[0]): # iterate through frequency bandas
print('freq band {}'.format(band_ind))
# remove nan samples
bg_up_bd_nonnan_ind = np.where(np.isnan(bg_up_mat[band_ind, :]) == False)[0]
sample_x = list_all_dummy_duration_sorted[bg_up_bd_nonnan_ind]
sample_y = bg_up_mat[band_ind, :][bg_up_bd_nonnan_ind]
# remove extreme large outliers
median = np.median(sample_y)
med_abs_dev = stats.median_absolute_deviation(sample_y)
if med_abs_dev == 0: # if all the values are zero, med_abs_dev will be zero
med_abs_dev = 1
non_ext_outliner_ind = \
np.where((median - med_abs_sigma * med_abs_dev < sample_y) & (sample_y < median + med_abs_sigma * med_abs_dev))[0]
sample_x = sample_x[non_ext_outliner_ind]
sample_y = sample_y[non_ext_outliner_ind]
# fit with a polynomial
z = np.polyfit(sample_x, sample_y, poly_degree)
dset = g.create_dataset("BP{band_ind:d}".format(band_ind=int(band_ind)), data=z, compression="gzip", compression_opts=9)
dset.attrs['channel'] = channel.encode('utf8')
dset.attrs['freqband'] = int(band_ind)
print("Finished extrapolate values of the background upper threshold")
print("Saved as {}".format(path_output_hdf5))
return None
#---------------
# for target sample
#---------------
[docs]def FindglitchlistextendBG(df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UpperDurationThresh, LowerDurationThresh, UserDefinedDuration, gap, position_duration_bfr_centr, TriggerPeakFreqLowerCutoff=0, TriggerPeakFreqUpperCutoff=8192, targetUpperSNR_thre=np.inf):
'''
description:
1. glitch data set (.csv file)
2. get the data about the target glitch class
3. get the subset of the target glitches based on SNR and confidence level threshold a user defines
4. the preceeding and following BGs are 64 sec-long for every samples regardless the overlapping with any other glitches
5. return the info of the accepted glitches
USAGE: Listsegments = FindglitchlistextendBG(df, Epochstart, Epochend, Commissioning_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UpperDurationThresh, LowerDurationThresh, UserDefinedDuration, gap, position_duration_bfr_centr, TriggerPeakFreqLowerCutoff=0, TriggerPeakFreqUpperCutoff=8192, targetUpperSNR_thre=np.inf)
:param df: GravitySpy meta data in pandas format
:param Epochstart: starting time of an epoch
:param Epochend: end time of an epoch
:param Commissioning_lt: commissioning time in list
:param TargetGlitchClass: a target glitch class name (str)
:param IFO: a type of interferometer (H1, L1, V1) (str)
:param BGSNR_thre: an upper threhold of SNR for background glitches, i.e., quiet enough ), float or int
:param targetSNR_thre: a lower threshold of SNR for target glitches, float or int
:param Confidence_thre: a threshold of confidence level (float or int )
:param UpperDurationThresh: an upper bound of duration in sec (float or int)
:param LowerDurationThresh: a lower bound of duration in sec (float or int)
:param UserDefinedDuration: user defined duration of a glitch (float or int), 0 in default
:param gap: a time gap between the target and the background segments in sec, 1 sec in default
:param position_duration_bfr_centr: proportion of duration for a target segment around a center time, e.g, 0.5 indicates duration is even distributed around a center time, 0.83 indicates 5/6 is before the center time
:param TriggerPeakFreqLowerCutoff: a lower limit cutoff value of the peak frequency of triggers given by an ETG for target glitches
:param TriggerPeakFreqUpperCutoff: an upper limit cutoff value of the peak frequency of triggers given by an ETG queries for target glitches
:param targetUpperSNR_thre: an upper limit cutoff value of SNR of triggers given by an ETG queries for target glitches
:param flag: 'Both' or 'Either' taking both backgronds or either the preceding or the following background, respectively to accept glitches
:return: the list of parameters of glitches passing the above thresholds
Listsegments contains of
ListIndexSatisfied: a list of index of glitches
Listtarget_timeseries_start: a list of a target glitch starting time
Listtarget_timeseries_end: a list of a target glitch ending time
Listpre_background_start: a list of preceding background starting time
Listpre_background_end; a list of preceding background ending time
Listfol_background_start: a list of following background starting time
Listfol_background_end: a list of following background ending time
Listgpstime: a list of GPS times
Listduration: a list of durations
ListSNR: a list of SNRs
Listconfi: a list of confidence levels
ListID: a list of IDs
'''
df_list = []
# take the data in an Epoch
for Epoch in Epoch_lt:
StartT, EndT = Epoch[0], Epoch[1]
df_epoch = df[(df['GPStime'] >= StartT) & (df['GPStime'] <= EndT)]
df_list.append(df_epoch)
df_int = pd.concat(df_list)
#### improvement for duration 2019-04-05 #############
#### full durations are too long for capturing the glitch signature so that the solution is to choose T90
### In this situation, a gap that is manually chosen will be gap_new = gap_m + 0.05*T100
FractionEnergy = 1. # (90% of energy )
# note (1 - FractionEnergy)/2. is the fraction of the one-sided off T90
# df_int = df_int.assign(gap = gap + (1 - FractionEnergy)/2.*df_int['duration'])
df_int = df_int.assign(gap=[gap] * df_int.shape[0])
df_int['duration'] = np.round(FractionEnergy*df_int['duration'], 3) # replace full durations (T100) with T90
# make the target segments
# I reserve duration +/- gap around a GPS time of a glitch
if UserDefinedDuration > 0: # accept a user defined duration
# redefine only the duration of the target glitch
df_int.loc[(df_int['ifo']==IFO) & (df_int['label']==TargetGlitchClass), ['duration']] = UserDefinedDuration
else:
pass
df_int = df_int.assign(reserved_time_start = df_int['GPStime'] - position_duration_bfr_centr * df_int['duration'] - df_int['gap'])
df_int = df_int.assign(reserved_time_end = df_int['GPStime'] + (1 - position_duration_bfr_centr) * df_int['duration'] + df_int['gap'])
# reserving_time does not include gap
#df_int = df_int.assign(reserved_time_start = df_int['GPStime'] - position_duration_bfr_centr * df_int['duration'])
#df_int = df_int.assign(reserved_time_end = df_int['GPStime'] + (1 - position_duration_bfr_centr) * df_int['duration'])
# exclude glitches with SNR <= BGSNR from df_int as they are quiet enough to be considered as background
df_int = df_int[df_int['snr'] >= BGSNR_thre] # I think this is ok 2018-12-15
# data set containing only a target glitch class
try: # gravity spy and omicron trigger mode
df_glitch = df_int[(df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['peakFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['peakFreq'] <= TriggerPeakFreqUpperCutoff) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)]
except KeyError: # pyCBC mode as there is no column "peakFreq"
# add central frequency cuts
df_glitch = df_int[(df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['centralFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['centralFreq'] <= TriggerPeakFreqUpperCutoff) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)]
# for cWB triggers
# df_glitch = df_int[(df_int['rho'] > 9) & (df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['centralFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['centralFreq'] <= TriggerPeakFreqUpperCutoff) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)]
print('all the glitches in the class: {}'.format(df_glitch.shape[0]))
# the list of values satisfied as secure
ListIndexSatisfied = []
Listtarget_timeseries_start = []
Listtarget_timeseries_end = []
Listpre_background_start = []
Listpre_background_end = []
Listfol_background_start = []
Listfol_background_end = []
Listgpstime = []
Listduration = []
ListSNR = []
Listconfi = []
ListID = []
for ind in range(len(df_glitch)):
# target glitch's peak GPS time and the duration
gpstime = df_glitch['GPStime'].iloc[ind] # a peak GPS time
duration = df_glitch['duration'].iloc[ind] # # a duration
SNR = df_glitch['snr'].iloc[ind] # snr
confi = df_glitch['confidence'].iloc[ind] # confidence level
ID = df_glitch['id'].iloc[ind] # ID
gap_individual = df_glitch['gap'].iloc[ind]
if duration > 0.02: # accept the duration of glitch is greater than 0.02 sec
pass # go to the next line
else: # if the duration of a glitch is less than 0.02 sec
continue # go to the next iteration
# the starting and ending time of the target glitch
target_timeseries_start = gpstime - duration * position_duration_bfr_centr
target_timeseries_end = target_timeseries_start + duration
# take the background
# the background is around the target glitch
# preceding background, which is a nearest neighbor segment before target_timeseries_start
# following background, which is a nearest neighbor segment after target_timeseries_end
background_duration = 64 # duration # duration of both the proceeding and following background
pre_background_start = target_timeseries_start - gap_individual - background_duration
pre_background_end = pre_background_start + background_duration #target_timeseries_start - gap_individual
fol_background_start = target_timeseries_end + gap_individual
fol_background_end = fol_background_start + background_duration #target_timeseries_end + gap_individual + background_duration
print('{:.1f}, {:.1f}, {:.1f}'.format(pre_background_end - pre_background_start,
fol_background_end - fol_background_start, duration))
if round(pre_background_end - pre_background_start, 3) < round(duration, 3) or round(
fol_background_end - fol_background_start, 3) < round(duration, 3):
sys.exit()
ListIndexSatisfied.append(ind) # append the index satisfied as secure
Listtarget_timeseries_start.append(target_timeseries_start)
Listtarget_timeseries_end.append(target_timeseries_end)
Listpre_background_start.append(pre_background_start)
Listpre_background_end.append(pre_background_end)
Listfol_background_start.append(fol_background_start)
Listfol_background_end.append(fol_background_end)
Listgpstime.append(gpstime)
Listduration.append(duration)
ListSNR.append(SNR)
Listconfi.append(confi)
ListID.append(ID)
# zipped list of segments
Listsegments = list(
zip(ListIndexSatisfied, Listtarget_timeseries_start, Listtarget_timeseries_end, Listpre_background_start,
Listpre_background_end, Listfol_background_start, Listfol_background_end, Listgpstime, Listduration,
ListSNR, Listconfi, ListID))
return Listsegments
[docs]def Multiprocess_whitening_for_target(full_timeseries, target_timeseries_start, target_timeseries_end):
'''
description:
This is used for multi processing for whitening segments
USAGE: whitened_fft_target, sample_rate, DURATION = Multiprocess_whitening_for_target(full_timeseries, target_timeseries_start, target_timeseries_end)
:param full_timeseries: time series comprising target and BGs
:param target_timeseries_start: a start time of a target segment
:param target_timeseries_end: an end time of a target segment
:return: whitened_fft_target: whitened fft of a target segment
sample_rate: sampling rate of this channel
DURATION: a duration of a target segment
'''
sample_rate = full_timeseries.sample_rate.value
DURATION = target_timeseries_end - target_timeseries_start
fourth_decimal_num_of_duration = (DURATION * 1000 - floor(DURATION * 1000)) * 10 # fourth decimal number
DURATION = DURATION - fourth_decimal_num_of_duration * 0.0001 # truncate at the fourth decimal number
# revise the duration to make an error in whitening process not not to occur
# ValueError occurs in the whitening when the lengths of whitened data, whitening data and the filter are different
third_decimal_num_of_duration = round((round(DURATION, 3) - round(DURATION, 2)) * 1000)
if third_decimal_num_of_duration % 2 == 1: # odd number at the third decimal point
DURATION = DURATION - 0.001 # make the duration with an even number at the third decimal point
elif third_decimal_num_of_duration % 2 == 0: # even number at the third decimal point
pass # keep the original duration as it is
DURATION = round(DURATION, 3) # round it up at 3rd decimal point as the above three lines creates small number below the 3rd decimal point
full_timeseries_start = full_timeseries.span[0]
full_timeseries_end = full_timeseries.span[1]
ave_background_asd = full_timeseries.crop(end=full_timeseries_start+128).asd(DURATION, DURATION * 0.5, method='median').value
# whitening Target
target_timeseries = full_timeseries.crop(start=target_timeseries_start, end=target_timeseries_end)
if (full_timeseries.value == 0).all() is True: # if the time series is all zero
whitened_fft_target = np.zeros(ave_background_asd.shape[0])
elif ave_background_asd[ave_background_asd != 0].shape[0] == 0: # if the time series is all zero
whitened_fft_target = np.zeros(ave_background_asd.shape[0])
else:
fft_target = target_timeseries.asd(DURATION, DURATION * 0.5, method='median').value # fft of the target
# whitening the target using the average BG
whitened_fft_target = fft_target / ave_background_asd
return whitened_fft_target, sample_rate, DURATION
[docs]def SaveOnlyTargetHDF5_OFFLINE(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, PlusHOFT='False'):
'''
description:
THIS IS USED FOR "OFFLINE" MODE
0. assuming Listsegments is given by Findglitchlist()
1.take the information of the list of allowed target and a preceding and following segments
2. whiten a target segmement based on the average background segment
3. find whitened FFT
4. save the whitened target and backgrounds FFTs
Note this depends on
USAGE: SaveOnlyTargetHDF5_OFFLINE(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, PlusHOFT='False')
:param Listsegments: a list of segment parameters
:param re_sfchs: a list of safe channels
:param outputpath: a directory of an output file
:param outputfilename: a name of an output file
:param number_process: a number of processes in parallel
:param PlusHOFT: whether to get data of hoft, {'True' or 'False'}, 'False' in default
'''
# check if the output directory exists
if os.path.isdir(outputpath) is False:
print("The directory for the output HDF5 file does not exist: path = {}".format(outputpath))
print("Please specify the existing directory for it.")
sys.exit()
# the list of the safe channels
channels = ['{0}:{1}'.format(IFO, ch) for ch in re_sfchs]
# the path and the file name
outputPathFileName = os.path.join(outputpath, outputfilename)
# make an output file
try:
f = h5py.File(outputPathFileName, 'w-')
f.close()
except IOError:
print('The file already exists in {}'.format(outputPathFileName))
print('Please choose a different file name in a configuration file or delete this file if you want to overwrite it')
sys.exit()
# remove the astro cashe
if os.path.isfile('~/.astro/cashe/downlowd/py3/lock') is True:
os.remove('~/.astro/cashe/downlowd/py3/lock')
# open a HDF5 file in writing mode
with h5py.File(outputPathFileName, 'r+') as f:
for IndexSatisfied, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end, gpstime, duration, SNR, confi, ID in Listsegments:
t0 = time.time()
try: # deal with time series in bad state
if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0: # if there is an available proceeding background and if there is an available following background
full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0: # if there is an available proceeding background and if there is NO available following background
full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, target_timeseries_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0: # if there is NO available proceeding background and if there is an available following background
full_timeseriesDict = TimeSeriesDict.find(channels, target_timeseries_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
else:
pass
channels_used = channels
if PlusHOFT == 'True': # get the data of hoft
if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0: # if there is an available proceeding background and if there is an available following background
full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], pre_background_start, fol_background_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0: # if there is an available proceeding background and if there is NO available following background
full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], pre_background_start, target_timeseries_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0: # if there is NO available proceeding background and if there is an available following background
full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], target_timeseries_start, fol_background_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
else:
pass
full_timeseriesDict.append(full_timeseriesDict_hoft) # append
del full_timeseriesDict_hoft # delete the hoft object
channels_used = np.append(channels, '{}:GDS-CALIB_STRAIN'.format(IFO))
# =====================================================
except IndexError:
print('Bad data, go to a next GPS')
continue # go to a next GPS time
except RuntimeError:
print('RuntimeError, go to a next GPS')
continue
except IOError: # error when getting time series
print('IOError, go to a next GPS')
continue
except ValueError: # ValueError: Failed to read /hdfs/frames/O3/raw/H1/H-H1_R-12384/H-H1_R-1238489600-64.gwf: Unable to map from the stream class id (0) to internal class id at position: 18446744073709551601 [streamref: class: 0 instance: 0 length: 0 ]
print('ValueError, go to a next GPS')
continue
g = f.create_group('gps{:05d}'.format(IndexSatisfied))
g.attrs['GPS'] = gpstime # a peak GPS time
g.attrs['duration'] = duration
g.attrs['snr'] = SNR # snr
g.attrs['confidence'] = confi # confidence
g.attrs['id'] = ID
g.attrs['ifo'] = channels[0].split(':')[0]
try:
ch_labels = range(len(channels_used)) # a list of channel labels
List_whitened_fft_target = [] # a list of whitened fft of target
List_sample_rate = [] # a list of sample rates
List_DURATION = [] # a list of durations
# whitened target and BG segments with multi processes
with concurrent.futures.ProcessPoolExecutor(max_workers=number_process) as executor:
for whitened_fft_target, sample_rate, DURATION in executor.map(Multiprocess_whitening_for_target,
full_timeseriesDict.values(),
itrtls.repeat(target_timeseries_start),
itrtls.repeat(target_timeseries_end)
):
List_whitened_fft_target.append(whitened_fft_target) # fill
List_sample_rate.append(sample_rate) # fill
List_DURATION.append(DURATION) # fill
for ch_label in ch_labels: # iterate through channel labels
dsetTarget = g.create_dataset("chT{:05d}".format(ch_label), data=List_whitened_fft_target[ch_label], compression="gzip", compression_opts=9) # target
dsetTarget.attrs['SamplingRate'] = List_sample_rate[ch_label] # target
dsetTarget.attrs['channel'] = channels_used[ch_label].split(':')[1] # target
dsetTarget.attrs['duration'] = List_DURATION[ch_label] # target
except ValueError:
del f['gps{:05d}'.format(IndexSatisfied)]
print('*** delete group gps{:05d} due to ASD estimation error ***'.format(IndexSatisfied))
continue # go to a next GPS time
except IOError: # IOError: Driver write request failed
del f['gps{:05d}'.format(IndexSatisfied)]
print('*** delete group gps{:05d} due to memory error ***'.format(IndexSatisfied))
continue # go to a next GPS time
print('Complete saving a sample as gps{:05d}'.format(IndexSatisfied))
t1 = time.time()
print('Time taken: {} s'.format(t1 - t0))
[docs]def cal_importance_single_channel_singl_freqband_from_bg_up_bd_prior(whitened_fft_target, sampling_rate, DURATION, poly_para, LowerCutOffFreq, UpperCutOffFreq):
"""
description:
calculate the importance for a single channel in a given frequency band
USAGE: Count = cal_importance_single_channel_singl_freqband_from_bg_up_bd_prior(whitened_fft_target, sampling_rate, DURATION, poly_para, LowerCutOffFreq, UpperCutOffFreq)
:param whitened_fft_target: on-source window normalized spectrum
:param sampling_rate: sample rate
:param DURATION: duration of on-source window
:param poly_para: polynomial parameters of the fit of the background upper threshold as a function of on-source window length
:param LowerCutOffFreq:
:param UpperCutOffFreq:
:return:
"""
sp = whitened_fft_target
bg_up_thresh_fitted = np.poly1d(poly_para)
GausCeiling = bg_up_thresh_fitted(DURATION)
if np.isnan(sp).any() != True: # if sp has data measured
# low, high, or band pass filter if we know which frequency band is important
if LowerCutOffFreq != 'None' and UpperCutOffFreq == 'None': # high-pass filter
## pT, pP, and pF should be zero if LowerCutOffFreq is greater than the half of the sampling rate as there is no data. However pT would be non-zero due to the bad resolution (a very short giltch)
if sampling_rate / 2 <= LowerCutOffFreq:
sp = np.array([0])
else:
LowerCutOffElement = int(LowerCutOffFreq * DURATION)
if LowerCutOffElement <= sp.shape[-1]: # if LowCutOffElement is within the length of the vector
sp = sp[LowerCutOffElement:]
else: # if LowerCutOffElement is greater than the length of the vector
sp = np.array([0])
elif LowerCutOffFreq == 'None' and UpperCutOffFreq != 'None': # low-pass filter
# even if UpperCutOffElement is greater than the length of the vector is works fine
UpperCutOffElement = int(UpperCutOffFreq * DURATION)
sp = sp[:UpperCutOffElement]
elif LowerCutOffFreq != 'None' and UpperCutOffFreq != 'None': # band pass filter
## pT, pP, and pF should be zero if LowerCutOffFreq is greater than the half of the sampling rate as there is no data. However pT would be non-zero due to the bad resolution (a very short giltch)
if sampling_rate / 2 <= LowerCutOffFreq:
sp = np.array([0])
else:
LowerCutOffElement = int(LowerCutOffFreq * DURATION)
UpperCutOffElement = int(UpperCutOffFreq * DURATION)
if LowerCutOffElement <= sp.shape[-1]: # if LowCutOffElement is within the length of the vector
sp = sp[LowerCutOffElement:UpperCutOffElement]
else: # if LowCutOffElement is greater than the length of the vector
sp = np.array([0])
else:
pass
if GausCeiling == 0:
Count = 0
else:
Count = sp[sp > GausCeiling].shape[0] / float(sp.shape[0])
else:
Count = 0 # manually put zero because this channel has no contribution at this glitch ID
return Count
[docs]def HierarchyChannelAboveThreshold_single_channel_multiband_from_bg_up_bd_prior(whitened_fft_target, sampling_rate, duration, list_poly_para):
'''
description:
calculate the importance for a signle channel for frequency bands
USAGE: Counts_in_multibands = HierarchyChannelAboveThreshold_single_channel_multiband_from_bg_up_bd_prior(whitened_fft_target, sampling_rate, duration, list_poly_para)
:param whitened_fft_target: whitened fft of the on-source window
:param duration: a duration of the on-source window
:param sampling_rate: sampling rate of a channel
:param list_poly_para: a list of polynomial fit of the background upper threshold per freq band
:return: Counts_in_multibands: values of importance in different frequency bands, numpy array
'''
# iterate different frequency bands in the list
Counts_in_multibands = np.array([cal_importance_single_channel_singl_freqband_from_bg_up_bd_prior(whitened_fft_target, sampling_rate, duration, list_poly_para[freq_band_ind], LowerCutOffFreq=Const.freq_bands[freq_band_ind][0], UpperCutOffFreq=Const.freq_bands[freq_band_ind][1]) for freq_band_ind in range(len(Const.freq_bands))])
return Counts_in_multibands
[docs]def CreateRho_single_channel_multiband_from_bg_up_bd_prior(full_timeseries, target_timeseries_start, target_timeseries_end, list_poly_para):
'''
discription:
1. calculate the normalized spectrum
2. compute the value of importance for a single channel across frequency bands
USAGE: Counts_in_multibands, sample_rate = CreateRho_single_channel_multiband_from_bg_up_bd_prior(full_timeseries, target_timeseries_start, target_timeseries_end, list_poly_para)
:param full_timeseries: the full time series in gwpy object including on- and off source windows
:param target_timeseries_start: the start time of the on-source window
:param target_timeseries_end: the end time of the on-source window
:param list_poly_para: a list of polynomial fit of the background upper threshold per freq band
:return:
Counts_in_multibands: values of importance in different frequency bands where importance is a fraction of frequency bins in a frequency range above an upper bound of the off-source window for a single channel
sample_rate: a sampling rate of a single channel
'''
# calculate the whitened FFT of the on- and off-source windows
whitened_fft_target, sample_rate, duration = Multiprocess_whitening_for_target(full_timeseries, target_timeseries_start, target_timeseries_end)
# calculate the importance
Counts_in_multibands = HierarchyChannelAboveThreshold_single_channel_multiband_from_bg_up_bd_prior(whitened_fft_target, sample_rate, duration, list_poly_para)
return Counts_in_multibands, sample_rate
[docs]def CreateAllChannels_rho_multband_from_bg_up_bd_prior(Listsegment, IFO, re_sfchs, number_process, PlusHOFT, hdf5_obj_bg_up_thresh):
'''
description:
1. use a single glitch time
2. query timeseries of all the channels around a glitch
3. calculate the normalized spectrum
4. compute the value of importance for a single channel across frequency bands
USAGE: IndexSatisfied, Mat_Count_in_multibands, list_sample_rates, re_sfchs, gpstime, duration, SNR, confi, ID = CreateAllChannels_rho_multband_from_bg_up_bd_prior(Listsegment, IFO, re_sfchs, number_process, PlusHOFT, hdf5_obj_bg_up_thresh)
:param Listsegment: a list of segment parameters
:param IFO
:param channels: a list of safe channels
:param number_process: a number of processes in parallel
:param PlusHOFT: whether to get data of hoft, {'True' or 'False'}
:param hdf5_obj_bg_up_thresh: a HDF5 object that contains the polynomial parameters of the fit that represent the background upper theshold as a functin of on-source window length for all the channels and freq bands
:return:
IndexSatisfied: glitch index
Mat_Count_in_multibands: a matrix of rho where frequencies in rows and channels in columns, numpy array
list_sample_rates: a list of sampling rates of channels, numpy array
re_sfchs: list of channels without "IFO:" at the beginning
gpstime: a gps time
duration: a value of duration
SNR: signal to noise ratio in the h(t)
conf: a confidence level of classification of a glitch, provided by Gravity Spy. Otherwise None
ID: a glitch ID, provided by Gravity Spy in usual
'''
# get the polynomical parameters fitted the background upper threshold across channels and the frequency bands
poly_para_dict = {}
list_g = list(hdf5_obj_bg_up_thresh.keys())
num_freq_band = len(list(hdf5_obj_bg_up_thresh[list_g[0]].keys()))
for ch in hdf5_obj_bg_up_thresh.keys():
g = hdf5_obj_bg_up_thresh[ch]
poly_para_dict[ch] = [g["BP{:d}".format(freq_band_ind)][()] for freq_band_ind in range(num_freq_band)]
channels = ['{0}:{1}'.format(IFO, ch) for ch in re_sfchs]
# reorder the keys in the backgroud upper thresh dictionary
poly_para_dict = {ch: poly_para_dict[ch] for ch in re_sfchs}
IndexSatisfied, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end, gpstime, duration, SNR, confi, ID = Listsegment
t0 = time.time()
# extracting the target data and the proceeding and following backgrounds
try: # deal with time series in bad state
if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0: # if there is an available proceeding background and if there is an available following background
full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0: # if there is an available proceeding background and if there is NO available following background
full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, target_timeseries_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0: # if there is NO available proceeding background and if there is an available following background
full_timeseriesDict = TimeSeriesDict.find(channels, target_timeseries_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
else:
pass
if PlusHOFT == 'True': # get the data of hoft
if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0: # if there is an available proceeding background and if there is an available following background
full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], pre_background_start, fol_background_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0: # if there is an available proceeding background and if there is NO available following background
full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], pre_background_start, target_timeseries_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0: # if there is NO available proceeding background and if there is an available following background
full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], target_timeseries_start, fol_background_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
else:
pass
full_timeseriesDict.append(full_timeseriesDict_hoft) # append
del full_timeseriesDict_hoft # delete the hoft object
re_sfchs = np.append(re_sfchs, 'GDS-CALIB_STRAIN')
except IndexError as err:
print("IndexError: {}".format(err.message))
# IndexError: After reading object of type: FrAdcData stream is no longer in good state. ( Frmae: 0 ChannelType: L1:ASC-X_TR_A_NSUM_OUT_DQ offset: 932092099)
return 0, 0, 0, 0, 0, 0, 0, 0, 0
except RuntimeError as err:
print("RuntimeError: {}".format(err.message))
return 0, 0, 0, 0, 0, 0, 0, 0, 0
except IOError as err: # error when getting time series
print('IOError: {}'.format(err.message))
return 0, 0, 0, 0, 0, 0, 0, 0, 0
except ValueError: # ValueError: Failed to read /hdfs/frames/O3/raw/H1/H-H1_R-12384/H-H1_R-1238489600-64.gwf: Unable to map from the stream class id (0) to internal class id at position: 18446744073709551601 [streamref: class: 0 instance: 0 length: 0 ]
print('ValueError')
return 0, 0, 0, 0, 0, 0, 0, 0, 0
try:
Mat_Count_in_multibands = np.array([]) # a list of rho
list_sample_rates = np.array([])
# whitened target and BG segments with multi processes
with concurrent.futures.ProcessPoolExecutor(max_workers=number_process) as executor:
for Counts_in_multibands, sample_rate in executor.map(CreateRho_single_channel_multiband_from_bg_up_bd_prior,
full_timeseriesDict.values(),
itrtls.repeat(target_timeseries_start),
itrtls.repeat(target_timeseries_end),
poly_para_dict.values()
):
# fill
#Mat_Count_in_multibands = np.vstack([Mat_Count_in_multibands, Counts_in_multibands]) if Mat_Count_in_multibands.size else Counts_in_multibands
Counts_in_multibands = Counts_in_multibands.reshape(Counts_in_multibands.shape[0], 1)
Mat_Count_in_multibands = np.hstack([Mat_Count_in_multibands, Counts_in_multibands]) if Mat_Count_in_multibands.size else Counts_in_multibands
list_sample_rates = np.append(list_sample_rates, sample_rate)
except ValueError:
print('*** delete group gps{:05d} due to ASD estimation error ***'.format(IndexSatisfied))
return 0, 0, 0, 0, 0, 0, 0, 0, 0
except IOError: # IOError: Driver write request failed
print('*** delete group gps{:05d} due to memory error ***'.format(IndexSatisfied))
return 0, 0, 0, 0, 0, 0, 0, 0, 0
t1 = time.time()
print('Time taken: {} s'.format(t1 - t0))
return IndexSatisfied, Mat_Count_in_multibands, list_sample_rates, re_sfchs, gpstime, duration, SNR, confi, ID # pass the parameters
[docs]def SaveOnlyTargetHDF5_multiband_OFFLINE_from_bg_up_bd_prior(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, path_hdf5_bg_up_thresh, PlusHOFT='False'):
'''
description:
THIS IS USED FOR "OFFLINE" MODE
1. take the information of the list of allowed target and a preceding and following segments
2. query time series and get normalized spectrum and calculate importance
4. save the whitened target and backgrounds FFTs
Note this depends on
USAGE: SaveOnlyTargetHDF5_multiband_OFFLINE_from_bg_up_bd_prior(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, path_hdf5_bg_up_thresh, PlusHOFT='False')
:param Listsegments: a list of segment parameters
:param re_sfchs: a list of safe channels
:param outputpath: a directory of an output file
:param outputfilename: a name of an output file
:param number_process: a number of processes in parallel
:param path_hdf5_bg_up_thresh: path to a HDF5 file that contains the polynomial fit of the background upper threshold
:param PlusHOFT: whether to get data of hoft, {'True' or 'False'}, 'False' in default
:return None
'''
# load a HDF5 file that contains the polynomical parameters extrapolating the background upper threshold
if os.path.isfile(path_hdf5_bg_up_thresh) is False:
print("No such file for the BG upper threshold: {}".format(path_hdf5_bg_up_thresh))
f_bg_up_thresh = h5py.File(path_hdf5_bg_up_thresh, 'r')
# the path and the file name
outputPathFileName = os.path.join(outputpath, outputfilename)
# make an output file
try:
f = h5py.File(outputPathFileName, 'w-')
f.close()
except IOError:
print('The file already exists in {}'.format(outputPathFileName))
print('Please choose a different file name in the configuration file or delete this file if you want to overwrite it')
sys.exit()
# remove the astro cashe
if os.path.isfile('~/.astro/cashe/downlowd/py3/lock') is True:
os.remove('~/.astro/cashe/downlowd/py3/lock')
# open a HDF5 file in writing mode
with h5py.File(outputPathFileName, 'r+') as f:
for Listsegment in Listsegments:
IndexSatisfied, Mat_Count_in_multibands, list_sample_rates, re_sfchs_used, gpstime, duration, SNR, confi, ID = CreateAllChannels_rho_multband_from_bg_up_bd_prior(Listsegment, IFO, re_sfchs, number_process, PlusHOFT, f_bg_up_thresh)
if type(duration) == int: # if some error occurs, List_rho is 0, go to the next iteration
continue
else:
g = f.create_group('gps{:05d}'.format(IndexSatisfied))
g.attrs['GPS'] = gpstime # a peak GPS time
g.attrs['duration'] = duration
g.attrs['snr'] = SNR # snr
g.attrs['confidence'] = confi # confidence
g.attrs['id'] = ID
g.attrs['ifo'] = IFO
dset_mat_rho = g.create_dataset("mat_rho", data=Mat_Count_in_multibands, compression="gzip", compression_opts=9) # a matrix of rho
dset_sample_rates = g.create_dataset("sample_rates", data=list_sample_rates, compression="gzip", compression_opts=9) # list of sample rates of channels
dset_channels = g.create_dataset("channels", data=[re_sfch.encode('utf8') for re_sfch in re_sfchs_used], compression="gzip", compression_opts=9) # list of channels
freq_bands = np.array(Const.freq_bands).tolist()
for i in range(len(freq_bands)):
for j in range(len(freq_bands[0])):
freq_bands[i][j] = freq_bands[i][j].encode('utf8')
dset_freq_bands = g.create_dataset("freq_bands", data=freq_bands, compression="gzip", compression_opts=9) # matrix of frequency bnads
#dsetTarget.attrs['duration'] = List_DURATION[ch_label] # target
with h5py.File(outputPathFileName, 'r') as f:
print(f['gps{:05d}'.format(IndexSatisfied)]['mat_rho'][()] )
#
#
# outputpath = "/home/kentaro.mogushi/public_html/longlive/Plots/GravitySpy/test_extendBG/null_m/HDF5data"
# outputfilename = "MagnetometerL1.h5"
#
# list_all_dummy_duration_sorted_file, mat_ch_sorted_dict_file = make_bg_up_thesh_interpolate_hdf5(outputpath, outputfilename)
# save_interpolate_bg_upper_thres_hdf5(list_all_dummy_duration_sorted_file, mat_ch_sorted_dict_file, '.', 'bg_up_thre.h5')
#
#
#
# #%%%%%%%%%%%%%%%%
#
#
# #------
# # target
# #---------
# path = "/home/kentaro.mogushi/public_html/longlive/Plots/GravitySpy/test_extendBG/target/HDF5data/MagnetometerL1.h5"
# f_t = h5py.File(path, 'r')
# list_g = list(f_t.keys())
# # --- channels ----
# list_channels = np.array([])
# g = f_t[list_g[0]]
# for dset_ind in g.keys():
# if dset_ind[:2] != "ch":
# continue
# channel = g[dset_ind].attrs["channel"]
# list_channels = np.append(list_channels, channel)
#
#
#
# ch = 'PEM-EX_MAG_EBAY_SUSRACK_QUAD_SUM_DQ'
# #ch = 'ASC-SRC2_Y_OUT_DQ'
# ch_ind_int = np.where(list_channels == ch)[0][0]
# list_sp_mean = np.array([])
# list_sp_std = np.array([])
# list_duration = np.array([])
# list_gpstime = np.array([])
# list_sp = []
# for gps in f_t.keys():
# g = f_t[gps]
# sp = g['chT{:05d}'.format(ch_ind_int)][()]
# list_sp.append(sp)
# sp_mean = sp.mean()
# sp_std = sp.std(ddof=1)
# gpstime = g.attrs['GPS']
# list_gpstime = np.append(list_gpstime, gpstime)
# duration = g['chT{:05d}'.format(ch_ind_int)].attrs['duration']
# list_sp_mean = np.append(list_sp_mean, sp_mean)
# list_sp_std = np.append(list_sp_std, sp_std)
# list_duration = np.append(list_duration, duration)
#
#
#
# #-----------
# # priori
# #------------
# outputpath = "/home/kentaro.mogushi/public_html/longlive/Plots/GravitySpy/test_extendBG/null_m/HDF5data"
# outputfilename = "MagnetometerL1.h5"
#
#
# path_outputfile = os.path.join(outputpath, outputfilename)
#
#
# f = h5py.File(path_outputfile, 'r')
# list_g = list(f.keys())
#
# #-------------------
# # get the list of channels
# #---------------------
# list_channels = np.array([])
# g = f[list_g[0]]
# for dset_ind in g.keys():
# if dset_ind[:2] != "ch":
# continue
# channel = g[dset_ind].attrs["channel"].decode('utf8')
# list_channels = np.append(list_channels, channel)
# list_freq_bands = g['freq_bands'][()]
# #----------------
# # get dummy durations
# # freq bands
# #----------------
#
#
# ch_ind_int = np.where(list_channels == ch)[0][0]
#
# # -- get the backgournd upper thresshod for all the glitches and all the channels
# mat_ch_dict = {}
# list_all_dummy_duration = np.array([])
# for gps_ind in f.keys(): # iterate glitches
# #if gps_ind != list_g[0]:
# # continue
# g = f[gps_ind]
# list_dummy_duration = g['dummy_durations'][()]
# list_all_dummy_duration = np.append(list_all_dummy_duration, list_dummy_duration)
# for ch_ind in range(len(list_channels)): # iterate channels
# #if ch_ind != ch_ind_int:
# # continue
# #print(ch_ind)
# bg_up_mat = g["ch{:05d}".format(ch_ind)][()]
# if gps_ind == list_g[0]:
# mat_ch_dict[list_channels[ch_ind]] = bg_up_mat
# else:
# mat_ch_dict[list_channels[ch_ind]] = np.hstack([mat_ch_dict[list_channels[ch_ind]], bg_up_mat]) if mat_ch_dict[list_channels[ch_ind]].size else bg_up_mat
#
#
# #-- short the upper threshold ascendingly in the order of dummy durations
# sorted_index = np.argsort(list_all_dummy_duration) # sort index ascending
# list_all_dummy_duration_sorted = list_all_dummy_duration[sorted_index]
# mat_ch_sorted_dict = {}
# for channel in mat_ch_dict.keys():
# mat_ch_dict[channel][:, sorted_index]
# mat_ch_sorted_dict[channel] = mat_ch_dict[channel][:, sorted_index]
#
#
# # -- fit
#
# file_bg_up_th_hdf5 = 'bg_up_thre.h5'
# try:
# f = h5py.File(file_bg_up_th_hdf5, 'w-')
# f.close()
# except IOError:
# print('The file already exists in {}'.format(file_bg_up_th_hdf5))
# os.system('rm {}'.format(file_bg_up_th_hdf5))
# # open a HDF5 file in writing mode
# with h5py.File(file_bg_up_th_hdf5, 'a') as f:
# for channel in list_channels:
# print(channel)
# g = f.create_group(channel)
# for band_ind in range(bg_up_mat.shape[0]):
# print('freq band {}'.format(band_ind))
# # remove nan samples
# bg_up_bd_nonnan_ind = np.where(np.isnan(mat_ch_sorted_dict[ch][band_ind, :]) == False)[0]
# sample_x = list_all_dummy_duration_sorted[bg_up_bd_nonnan_ind]
# sample_y = mat_ch_sorted_dict[ch][band_ind, :][bg_up_bd_nonnan_ind]
# # remove extreme large outliers
# median = np.median(sample_y)
# med_abs_dev = stats.median_absolute_deviation(sample_y)
# if med_abs_dev == 0: # if all the values are zero, med_abs_dev will be zero
# med_abs_dev = 1
# non_ext_outliner_ind = np.where((median - 6 * med_abs_dev < sample_y) & (sample_y < median + 6 * med_abs_dev))[0]
# sample_x = sample_x[non_ext_outliner_ind]
# sample_y = sample_y[non_ext_outliner_ind]
# z = np.polyfit(sample_x, sample_y, 10)
# print('0000')
# dset = g.create_dataset("BP{band_ind:d}".format(band_ind=int(band_ind)), data=z, compression="gzip", compression_opts=9)
# print('11111')
# dset.attrs['channel'] = channel.encode('utf8')
# print('22222')
# dset.attrs['freqband'] = int(band_ind)
#
#
#
#
# plot
#
# f_up = h5py.File("/home/kentaro.mogushi/public_html/longlive/Plots/GravitySpy/test_extendBG/bg_up_thre.h5", 'r')
# for band_ind in range(8)[7:]:
# plt.plot(list_all_dummy_duration_sorted, mat_ch_sorted_dict[ch][band_ind, :], label='band {}'.format(band_ind), color='gray', alpha=0.5)
# # remove nan samples
# bg_up_bd_nonnan_ind = np.where(np.isnan(mat_ch_sorted_dict[ch][band_ind, :]) == False)[0]
# sample_x = list_all_dummy_duration_sorted[bg_up_bd_nonnan_ind]
# sample_y = mat_ch_sorted_dict[ch][band_ind, :][bg_up_bd_nonnan_ind]
# # remove extreme large outliers
# median = np.median(sample_y)
# med_abs_dev = stats.median_absolute_deviation(sample_y)
# if med_abs_dev == 0:
# med_abs_dev = 1
# non_ext_outliner_ind = np.where((median - 6 * med_abs_dev < sample_y) & (sample_y < median + 6 * med_abs_dev))[0]
# sample_x = sample_x[non_ext_outliner_ind]
# sample_y = sample_y[non_ext_outliner_ind]
# z = np.polyfit(sample_x, sample_y, 10)
# print(z)
# p = np.poly1d(z)
# plt.plot(list_all_dummy_duration_sorted, p(list_all_dummy_duration_sorted), color='blue', ls='-', alpha=0.5,
# label='polynomial fit')
# # ####
# z = f_up[ch]["BP{:d}".format(band_ind)][()]
# print(z)
# p = np.poly1d(z)
# plt.plot(list_all_dummy_duration_sorted, p(list_all_dummy_duration_sorted), color='blue', ls='--', alpha=0.5, label='polynomial fit from the file')
# #plt.errorbar(list_duration, list_sp_mean, yerr=list_sp_std, fmt='o', color='red')
# plt.legend()
# plt.title(ch)
# plt.yscale('log')
# plt.xscale('log')
# #plt.ylim(median - 6 * med_abs_dev, median + 6 * med_abs_dev)
# plt.ylabel("upper cut")
# plt.xlabel("duration [sec]")
# plt.savefig('{}.png'.format(ch))
# plt.close()
#
# # importance
# list_imp = []
# for t_duration_ind in range(len(list_duration)):
# sp = list_sp[t_duration_ind]
# imp = sp[sp > p(list_duration[t_duration_ind])].shape[0] / float(sp.shape[0])
# list_imp.append(imp)
#
# plt.scatter(list_duration, list_imp)
# plt.title(ch)
# plt.xscale('log')
# plt.ylabel("rho")
# plt.xlabel("duration [sec]")
# plt.savefig("imp_{}.png".format(ch))
# plt.close()
#
#
#
#
#
#
#
#
#
# gps = 1262368238
# gps1 = 1262369268
# d = 64
# fft_length = 4
# gwdata = TimeSeries.get('L1:GDS-CALIB_STRAIN', gps-d/2., gps+d/2.)
# duration = 3.45
# on = gwdata.crop(end=gwdata.span[0] + duration)
# spectrum = gwdata.asd(fft_length, fft_length /2, method='median')
# interpolated_spectrum = spectrum.interpolate(round(1/duration, 5))
#
#
# on_asd = on.asd(duration, round(duration /2, 3), method='median')
#
#
# print("spectrum df= {} Hz".format(spectrum.df))
# print("interpolated spectrum df= {} Hz".format(interpolated_spectrum.df))
# print("on spectrum df= {} Hz".format(on_asd.df))
#
#
# gps = 1262368238 # glitchy
# gps1 = 1262369268 # quiet
# d = 64
# gwdata = TimeSeries.get('L1:GDS-CALIB_STRAIN', gps-d/2., gps+d/2.)
# gwdata1 = TimeSeries.get('L1:GDS-CALIB_STRAIN', gps1-d/2., gps1+d/2.)
# gwdata2 = TimeSeries.get('L1:GDS-CALIB_STRAIN', gps1-1., gps1+1.)
#
#
# list_fft_length = [16, 8, 4]
# list_fft_length = [4]
# list_method = ['median', 'welch']
# #list_method = ['median']
# for fft_length in list_fft_length:
# for method in list_method:
# if method == 'welch':
# ls = '--'
# else:
# ls = '-'
# spectrum = gwdata.asd(fft_length, fft_length /2, method=method)
# spectrum1 = gwdata1.asd(fft_length, fft_length / 2, method=method)
# spectrum2 = gwdata2.asd(fft_length, fft_length / 2, method=method)
# plt.plot(spectrum.frequencies, spectrum.value, label='fft length = {}, method = {}'.format(fft_length, method), alpha=0.5)
# plt.plot(spectrum1.frequencies, spectrum1.value, label='fft length = {}, method = {}, quiet'.format(fft_length, method), alpha=0.5)
# plt.plot(spectrum2.frequencies, spectrum2.value, label='fft length = {}, method = {}, quiet, short'.format(fft_length, method), alpha=0.5)
# plt.title("duration = {} sec".format(d))
# plt.xlim(40, 4000)
# plt.ylabel(r'GW strain ASD [strain$/\sqrt{\mathrm{Hz}}$]')
# plt.ylim(1e-24, 3e-21)
# plt.xscale('log')
# plt.yscale('log')
# plt.legend()
# plt.savefig("spectrum.png")
# plt.close()
# #
# # import time as TIME
# # from gwpy.timeseries import TimeSeries, TimeSeriesDict
# # gps = 1175755932.81
# # pre_background_start = gps - 64
# # fol_background_end = gps + 64
# # IFO = "L1"
# # number_process = 5
# # channels = ["L1:PEM-EX_MAG_EBAY_SUSRACK_QUAD_SUM_DQ"]
# # t0 = TIME.time()
# # full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
# # t1 = TIME.time()
# # print("Time taken in {} sec".format(t1 - t0))
#
#
#
#
#
# import pandas as pd
# import os
# import sys
# import matplotlib
# matplotlib.use("Agg")
# import matplotlib.pyplot as plt
#
#
#
# #------------
# # find the quiet time
# #--------------
#
# path = '/home/kentaro.mogushi/longlived/MachineLearningJointPisaUM/dataset/GravityspyDataSet/O2/gspy-db-20180813.csv'
# df = pd.read_csv(path)
#
# df = df.drop_duplicates(subset=["GPStime", "duration"]).reset_index()
# df = df.assign(reserved_time_start=df['GPStime'] - df['duration'] / 2. - 0.1)
# df = df.assign(reserved_time_end=df['GPStime'] + df['duration'] / 2. + 0.1)
#
#
#
#
#
#
#
#
#
#
#
#
# #
# # for i in range(df.shape[0])[200000:200100]:
# # plt.plot([df.iloc[i]["reserved_time_start"] - res_start.min(), df.iloc[i]["reserved_time_end"] - res_start.min()], [i,i])
# #
# # plt.ylabel("Glitch index")
# # plt.xlabel("on-source window")
# # plt.savefig("glitch.png")
# # plt.close()
#
#
# c = res_start[1:] - res_end[:-1]
#
#
#
#
# df_q = pd.DataFrame()
#
#
# plt.hist(c, bins=50)
# plt.ylabel("Count")
# plt.xlabel("Quiet time length [sec]")
# plt.savefig("quiet_time_length.png")
# plt.close()
#
#
#
#