From 80f46601308bd831a2ea6117f590ce6dee59aad0 Mon Sep 17 00:00:00 2001 From: Jason Kulatunga Date: Sun, 25 Jul 2021 22:11:07 -0700 Subject: [PATCH] validate thresholds whenever SMART data is received. --- webapp/backend/pkg/database/interface.go | 2 + .../pkg/database/scrutiny_repository.go | 20 +++ .../backend/pkg/models/measurements/smart.go | 85 +++++---- .../measurements/smart_ata_attribute.go | 167 +++++++++--------- .../models/measurements/smart_attribute.go | 1 + .../measurements/smart_nvme_attribute.go | 51 +++--- .../measurements/smart_scsci_attribute.go | 48 ++--- .../ata_attribute_metadata.go | 2 +- .../nvme_attribute_metadata.go | 2 +- .../scsi_attribute_metadata.go | 2 +- .../pkg/web/handler/get_device_details.go | 8 +- .../pkg/web/handler/upload_device_metrics.go | 12 +- 12 files changed, 237 insertions(+), 163 deletions(-) rename webapp/backend/pkg/{metadata => thresholds}/ata_attribute_metadata.go (99%) rename webapp/backend/pkg/{metadata => thresholds}/nvme_attribute_metadata.go (99%) rename webapp/backend/pkg/{metadata => thresholds}/scsi_attribute_metadata.go (99%) diff --git a/webapp/backend/pkg/database/interface.go b/webapp/backend/pkg/database/interface.go index 5bca804..bc52475 100644 --- a/webapp/backend/pkg/database/interface.go +++ b/webapp/backend/pkg/database/interface.go @@ -2,6 +2,7 @@ package database import ( "context" + "github.com/analogj/scrutiny/webapp/backend/pkg" "github.com/analogj/scrutiny/webapp/backend/pkg/models" "github.com/analogj/scrutiny/webapp/backend/pkg/models/collector" "github.com/analogj/scrutiny/webapp/backend/pkg/models/measurements" @@ -16,6 +17,7 @@ type DeviceRepo interface { RegisterDevice(ctx context.Context, dev models.Device) error GetDevices(ctx context.Context) ([]models.Device, error) UpdateDevice(ctx context.Context, wwn string, collectorSmartData collector.SmartInfo) (models.Device, error) + UpdateDeviceStatus(ctx context.Context, wwn string, status pkg.DeviceStatus) (models.Device, error) 
GetDeviceDetails(ctx context.Context, wwn string) (models.Device, error) SaveSmartAttributes(ctx context.Context, wwn string, collectorSmartData collector.SmartInfo) (measurements.Smart, error) diff --git a/webapp/backend/pkg/database/scrutiny_repository.go b/webapp/backend/pkg/database/scrutiny_repository.go index a7711c1..bcfa20b 100644 --- a/webapp/backend/pkg/database/scrutiny_repository.go +++ b/webapp/backend/pkg/database/scrutiny_repository.go @@ -3,6 +3,7 @@ package database import ( "context" "fmt" + "github.com/analogj/scrutiny/webapp/backend/pkg" "github.com/analogj/scrutiny/webapp/backend/pkg/config" "github.com/analogj/scrutiny/webapp/backend/pkg/models" "github.com/analogj/scrutiny/webapp/backend/pkg/models/collector" @@ -163,6 +164,17 @@ func (sr *scrutinyRepository) UpdateDevice(ctx context.Context, wwn string, coll return device, sr.gormClient.Model(&device).Updates(device).Error } +//Update Device Status +func (sr *scrutinyRepository) UpdateDeviceStatus(ctx context.Context, wwn string, status pkg.DeviceStatus) (models.Device, error) { + var device models.Device + if err := sr.gormClient.WithContext(ctx).Where("wwn = ?", wwn).First(&device).Error; err != nil { + return device, fmt.Errorf("Could not get device from DB", err) + } + + device.DeviceStatus = pkg.Set(device.DeviceStatus, status) + return device, sr.gormClient.Model(&device).Updates(device).Error +} + func (sr *scrutinyRepository) GetDeviceDetails(ctx context.Context, wwn string) (models.Device, error) { var device models.Device @@ -434,3 +446,11 @@ func (sr *scrutinyRepository) GetSummary(ctx context.Context) (map[string]*model return summaries, nil } + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Process Thresholds +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +func (sr *scrutinyRepository) ProcessSmartAttributeThresholds() { + +} diff 
--git a/webapp/backend/pkg/models/measurements/smart.go b/webapp/backend/pkg/models/measurements/smart.go index 20524ff..315bd11 100644 --- a/webapp/backend/pkg/models/measurements/smart.go +++ b/webapp/backend/pkg/models/measurements/smart.go @@ -3,8 +3,8 @@ package measurements import ( "fmt" "github.com/analogj/scrutiny/webapp/backend/pkg" - "github.com/analogj/scrutiny/webapp/backend/pkg/metadata" "github.com/analogj/scrutiny/webapp/backend/pkg/models/collector" + "github.com/analogj/scrutiny/webapp/backend/pkg/thresholds" "log" "strings" "time" @@ -22,6 +22,9 @@ type Smart struct { //Attributes (fields) Attributes map[string]SmartAttribute `json:"attrs"` + + //status + Status pkg.DeviceStatus } func (sm *Smart) Flatten() (tags map[string]string, fields map[string]interface{}) { @@ -133,6 +136,7 @@ func (sm *Smart) FromCollectorSmartInfo(wwn string, info collector.SmartInfo) er //generate SmartAtaAttribute entries from Scrutiny Collector Smart data. func (sm *Smart) ProcessAtaSmartInfo(info collector.SmartInfo) { + sm.Status = pkg.DeviceStatusPassed for _, collectorAttr := range info.AtaSmartAttributes.Table { attrModel := SmartAtaAttribute{ AttributeId: collectorAttr.ID, @@ -146,53 +150,72 @@ func (sm *Smart) ProcessAtaSmartInfo(info collector.SmartInfo) { } //now that we've parsed the data from the smartctl response, lets match it against our metadata rules and add additional Scrutiny specific data. 
- if smartMetadata, ok := metadata.AtaMetadata[collectorAttr.ID]; ok { + if smartMetadata, ok := thresholds.AtaMetadata[collectorAttr.ID]; ok { attrModel.Name = smartMetadata.DisplayName if smartMetadata.Transform != nil { attrModel.TransformedValue = smartMetadata.Transform(attrModel.Value, attrModel.RawValue, attrModel.RawString) } } + attrModel.PopulateAttributeStatus() sm.Attributes[string(collectorAttr.ID)] = &attrModel + if attrModel.Status == pkg.SmartAttributeStatusFailed { + sm.Status = pkg.DeviceStatusFailedScrutiny + } } } //generate SmartNvmeAttribute entries from Scrutiny Collector Smart data. func (sm *Smart) ProcessNvmeSmartInfo(info collector.SmartInfo) { + sm.Attributes = map[string]SmartAttribute{ - "critical_warning": &SmartNvmeAttribute{AttributeId: "critical_warning", Name: "Critical Warning", Value: info.NvmeSmartHealthInformationLog.CriticalWarning, Threshold: 0}, - "temperature": &SmartNvmeAttribute{AttributeId: "temperature", Name: "Temperature", Value: info.NvmeSmartHealthInformationLog.Temperature, Threshold: -1}, - "available_spare": &SmartNvmeAttribute{AttributeId: "available_spare", Name: "Available Spare", Value: info.NvmeSmartHealthInformationLog.AvailableSpare, Threshold: info.NvmeSmartHealthInformationLog.AvailableSpareThreshold}, - "percentage_used": &SmartNvmeAttribute{AttributeId: "percentage_used", Name: "Percentage Used", Value: info.NvmeSmartHealthInformationLog.PercentageUsed, Threshold: 100}, - "data_units_read": &SmartNvmeAttribute{AttributeId: "data_units_read", Name: "Data Units Read", Value: info.NvmeSmartHealthInformationLog.DataUnitsRead, Threshold: -1}, - "data_units_written": &SmartNvmeAttribute{AttributeId: "data_units_written", Name: "Data Units Written", Value: info.NvmeSmartHealthInformationLog.DataUnitsWritten, Threshold: -1}, - "host_reads": &SmartNvmeAttribute{AttributeId: "host_reads", Name: "Host Reads", Value: info.NvmeSmartHealthInformationLog.HostReads, Threshold: -1}, - "host_writes": 
&SmartNvmeAttribute{AttributeId: "host_writes", Name: "Host Writes", Value: info.NvmeSmartHealthInformationLog.HostWrites, Threshold: -1}, - "controller_busy_time": &SmartNvmeAttribute{AttributeId: "controller_busy_time", Name: "Controller Busy Time", Value: info.NvmeSmartHealthInformationLog.ControllerBusyTime, Threshold: -1}, - "power_cycles": &SmartNvmeAttribute{AttributeId: "power_cycles", Name: "Power Cycles", Value: info.NvmeSmartHealthInformationLog.PowerCycles, Threshold: -1}, - "power_on_hours": &SmartNvmeAttribute{AttributeId: "power_on_hours", Name: "Power on Hours", Value: info.NvmeSmartHealthInformationLog.PowerOnHours, Threshold: -1}, - "unsafe_shutdowns": &SmartNvmeAttribute{AttributeId: "unsafe_shutdowns", Name: "Unsafe Shutdowns", Value: info.NvmeSmartHealthInformationLog.UnsafeShutdowns, Threshold: -1}, - "media_errors": &SmartNvmeAttribute{AttributeId: "media_errors", Name: "Media Errors", Value: info.NvmeSmartHealthInformationLog.MediaErrors, Threshold: 0}, - "num_err_log_entries": &SmartNvmeAttribute{AttributeId: "num_err_log_entries", Name: "Numb Err Log Entries", Value: info.NvmeSmartHealthInformationLog.NumErrLogEntries, Threshold: 0}, - "warning_temp_time": &SmartNvmeAttribute{AttributeId: "warning_temp_time", Name: "Warning Temp Time", Value: info.NvmeSmartHealthInformationLog.WarningTempTime, Threshold: -1}, - "critical_comp_time": &SmartNvmeAttribute{AttributeId: "critical_comp_time", Name: "Critical CompTime", Value: info.NvmeSmartHealthInformationLog.CriticalCompTime, Threshold: -1}, + "critical_warning": (&SmartNvmeAttribute{AttributeId: "critical_warning", Name: "Critical Warning", Value: info.NvmeSmartHealthInformationLog.CriticalWarning, Threshold: 0}).PopulateAttributeStatus(), + "temperature": (&SmartNvmeAttribute{AttributeId: "temperature", Name: "Temperature", Value: info.NvmeSmartHealthInformationLog.Temperature, Threshold: -1}).PopulateAttributeStatus(), + "available_spare": (&SmartNvmeAttribute{AttributeId: 
"available_spare", Name: "Available Spare", Value: info.NvmeSmartHealthInformationLog.AvailableSpare, Threshold: info.NvmeSmartHealthInformationLog.AvailableSpareThreshold}).PopulateAttributeStatus(), + "percentage_used": (&SmartNvmeAttribute{AttributeId: "percentage_used", Name: "Percentage Used", Value: info.NvmeSmartHealthInformationLog.PercentageUsed, Threshold: 100}).PopulateAttributeStatus(), + "data_units_read": (&SmartNvmeAttribute{AttributeId: "data_units_read", Name: "Data Units Read", Value: info.NvmeSmartHealthInformationLog.DataUnitsRead, Threshold: -1}).PopulateAttributeStatus(), + "data_units_written": (&SmartNvmeAttribute{AttributeId: "data_units_written", Name: "Data Units Written", Value: info.NvmeSmartHealthInformationLog.DataUnitsWritten, Threshold: -1}).PopulateAttributeStatus(), + "host_reads": (&SmartNvmeAttribute{AttributeId: "host_reads", Name: "Host Reads", Value: info.NvmeSmartHealthInformationLog.HostReads, Threshold: -1}).PopulateAttributeStatus(), + "host_writes": (&SmartNvmeAttribute{AttributeId: "host_writes", Name: "Host Writes", Value: info.NvmeSmartHealthInformationLog.HostWrites, Threshold: -1}).PopulateAttributeStatus(), + "controller_busy_time": (&SmartNvmeAttribute{AttributeId: "controller_busy_time", Name: "Controller Busy Time", Value: info.NvmeSmartHealthInformationLog.ControllerBusyTime, Threshold: -1}).PopulateAttributeStatus(), + "power_cycles": (&SmartNvmeAttribute{AttributeId: "power_cycles", Name: "Power Cycles", Value: info.NvmeSmartHealthInformationLog.PowerCycles, Threshold: -1}).PopulateAttributeStatus(), + "power_on_hours": (&SmartNvmeAttribute{AttributeId: "power_on_hours", Name: "Power on Hours", Value: info.NvmeSmartHealthInformationLog.PowerOnHours, Threshold: -1}).PopulateAttributeStatus(), + "unsafe_shutdowns": (&SmartNvmeAttribute{AttributeId: "unsafe_shutdowns", Name: "Unsafe Shutdowns", Value: info.NvmeSmartHealthInformationLog.UnsafeShutdowns, Threshold: -1}).PopulateAttributeStatus(), + "media_errors": 
(&SmartNvmeAttribute{AttributeId: "media_errors", Name: "Media Errors", Value: info.NvmeSmartHealthInformationLog.MediaErrors, Threshold: 0}).PopulateAttributeStatus(), + "num_err_log_entries": (&SmartNvmeAttribute{AttributeId: "num_err_log_entries", Name: "Numb Err Log Entries", Value: info.NvmeSmartHealthInformationLog.NumErrLogEntries, Threshold: 0}).PopulateAttributeStatus(), + "warning_temp_time": (&SmartNvmeAttribute{AttributeId: "warning_temp_time", Name: "Warning Temp Time", Value: info.NvmeSmartHealthInformationLog.WarningTempTime, Threshold: -1}).PopulateAttributeStatus(), + "critical_comp_time": (&SmartNvmeAttribute{AttributeId: "critical_comp_time", Name: "Critical CompTime", Value: info.NvmeSmartHealthInformationLog.CriticalCompTime, Threshold: -1}).PopulateAttributeStatus(), + } + + //find analyzed attribute status + for _, val := range sm.Attributes { + if val.GetStatus() == pkg.SmartAttributeStatusFailed { + sm.Status = pkg.DeviceStatusFailedScrutiny + } } } //generate SmartScsiAttribute entries from Scrutiny Collector Smart data. 
func (sm *Smart) ProcessScsiSmartInfo(info collector.SmartInfo) { sm.Attributes = map[string]SmartAttribute{ - "scsi_grown_defect_list": &SmartScsiAttribute{AttributeId: "scsi_grown_defect_list", Name: "Grown Defect List", Value: info.ScsiGrownDefectList, Threshold: 0}, - "read_errors_corrected_by_eccfast": &SmartScsiAttribute{AttributeId: "read_errors_corrected_by_eccfast", Name: "Read Errors Corrected by ECC Fast", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByEccfast, Threshold: -1}, - "read_errors_corrected_by_eccdelayed": &SmartScsiAttribute{AttributeId: "read_errors_corrected_by_eccdelayed", Name: "Read Errors Corrected by ECC Delayed", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByEccdelayed, Threshold: -1}, - "read_errors_corrected_by_rereads_rewrites": &SmartScsiAttribute{AttributeId: "read_errors_corrected_by_rereads_rewrites", Name: "Read Errors Corrected by ReReads/ReWrites", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByRereadsRewrites, Threshold: 0}, - "read_total_errors_corrected": &SmartScsiAttribute{AttributeId: "read_total_errors_corrected", Name: "Read Total Errors Corrected", Value: info.ScsiErrorCounterLog.Read.TotalErrorsCorrected, Threshold: -1}, - "read_correction_algorithm_invocations": &SmartScsiAttribute{AttributeId: "read_correction_algorithm_invocations", Name: "Read Correction Algorithm Invocations", Value: info.ScsiErrorCounterLog.Read.CorrectionAlgorithmInvocations, Threshold: -1}, - "read_total_uncorrected_errors": &SmartScsiAttribute{AttributeId: "read_total_uncorrected_errors", Name: "Read Total Uncorrected Errors", Value: info.ScsiErrorCounterLog.Read.TotalUncorrectedErrors, Threshold: 0}, - "write_errors_corrected_by_eccfast": &SmartScsiAttribute{AttributeId: "write_errors_corrected_by_eccfast", Name: "Write Errors Corrected by ECC Fast", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByEccfast, Threshold: -1}, - "write_errors_corrected_by_eccdelayed": &SmartScsiAttribute{AttributeId: 
"write_errors_corrected_by_eccdelayed", Name: "Write Errors Corrected by ECC Delayed", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByEccdelayed, Threshold: -1}, - "write_errors_corrected_by_rereads_rewrites": &SmartScsiAttribute{AttributeId: "write_errors_corrected_by_rereads_rewrites", Name: "Write Errors Corrected by ReReads/ReWrites", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByRereadsRewrites, Threshold: 0}, - "write_total_errors_corrected": &SmartScsiAttribute{AttributeId: "write_total_errors_corrected", Name: "Write Total Errors Corrected", Value: info.ScsiErrorCounterLog.Write.TotalErrorsCorrected, Threshold: -1}, - "write_correction_algorithm_invocations": &SmartScsiAttribute{AttributeId: "write_correction_algorithm_invocations", Name: "Write Correction Algorithm Invocations", Value: info.ScsiErrorCounterLog.Write.CorrectionAlgorithmInvocations, Threshold: -1}, - "write_total_uncorrected_errors": &SmartScsiAttribute{AttributeId: "write_total_uncorrected_errors", Name: "Write Total Uncorrected Errors", Value: info.ScsiErrorCounterLog.Write.TotalUncorrectedErrors, Threshold: 0}, + "scsi_grown_defect_list": (&SmartScsiAttribute{AttributeId: "scsi_grown_defect_list", Name: "Grown Defect List", Value: info.ScsiGrownDefectList, Threshold: 0}).PopulateAttributeStatus(), + "read_errors_corrected_by_eccfast": (&SmartScsiAttribute{AttributeId: "read_errors_corrected_by_eccfast", Name: "Read Errors Corrected by ECC Fast", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByEccfast, Threshold: -1}).PopulateAttributeStatus(), + "read_errors_corrected_by_eccdelayed": (&SmartScsiAttribute{AttributeId: "read_errors_corrected_by_eccdelayed", Name: "Read Errors Corrected by ECC Delayed", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByEccdelayed, Threshold: -1}).PopulateAttributeStatus(), + "read_errors_corrected_by_rereads_rewrites": (&SmartScsiAttribute{AttributeId: "read_errors_corrected_by_rereads_rewrites", Name: "Read Errors Corrected by 
ReReads/ReWrites", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByRereadsRewrites, Threshold: 0}).PopulateAttributeStatus(), + "read_total_errors_corrected": (&SmartScsiAttribute{AttributeId: "read_total_errors_corrected", Name: "Read Total Errors Corrected", Value: info.ScsiErrorCounterLog.Read.TotalErrorsCorrected, Threshold: -1}).PopulateAttributeStatus(), + "read_correction_algorithm_invocations": (&SmartScsiAttribute{AttributeId: "read_correction_algorithm_invocations", Name: "Read Correction Algorithm Invocations", Value: info.ScsiErrorCounterLog.Read.CorrectionAlgorithmInvocations, Threshold: -1}).PopulateAttributeStatus(), + "read_total_uncorrected_errors": (&SmartScsiAttribute{AttributeId: "read_total_uncorrected_errors", Name: "Read Total Uncorrected Errors", Value: info.ScsiErrorCounterLog.Read.TotalUncorrectedErrors, Threshold: 0}).PopulateAttributeStatus(), + "write_errors_corrected_by_eccfast": (&SmartScsiAttribute{AttributeId: "write_errors_corrected_by_eccfast", Name: "Write Errors Corrected by ECC Fast", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByEccfast, Threshold: -1}).PopulateAttributeStatus(), + "write_errors_corrected_by_eccdelayed": (&SmartScsiAttribute{AttributeId: "write_errors_corrected_by_eccdelayed", Name: "Write Errors Corrected by ECC Delayed", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByEccdelayed, Threshold: -1}).PopulateAttributeStatus(), + "write_errors_corrected_by_rereads_rewrites": (&SmartScsiAttribute{AttributeId: "write_errors_corrected_by_rereads_rewrites", Name: "Write Errors Corrected by ReReads/ReWrites", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByRereadsRewrites, Threshold: 0}).PopulateAttributeStatus(), + "write_total_errors_corrected": (&SmartScsiAttribute{AttributeId: "write_total_errors_corrected", Name: "Write Total Errors Corrected", Value: info.ScsiErrorCounterLog.Write.TotalErrorsCorrected, Threshold: -1}).PopulateAttributeStatus(), + 
"write_correction_algorithm_invocations": (&SmartScsiAttribute{AttributeId: "write_correction_algorithm_invocations", Name: "Write Correction Algorithm Invocations", Value: info.ScsiErrorCounterLog.Write.CorrectionAlgorithmInvocations, Threshold: -1}).PopulateAttributeStatus(), + "write_total_uncorrected_errors": (&SmartScsiAttribute{AttributeId: "write_total_uncorrected_errors", Name: "Write Total Uncorrected Errors", Value: info.ScsiErrorCounterLog.Write.TotalUncorrectedErrors, Threshold: 0}).PopulateAttributeStatus(), + } + + //find analyzed attribute status + for _, val := range sm.Attributes { + if val.GetStatus() == pkg.SmartAttributeStatusFailed { + sm.Status = pkg.DeviceStatusFailedScrutiny + } } } diff --git a/webapp/backend/pkg/models/measurements/smart_ata_attribute.go b/webapp/backend/pkg/models/measurements/smart_ata_attribute.go index 691cac7..6481b4f 100644 --- a/webapp/backend/pkg/models/measurements/smart_ata_attribute.go +++ b/webapp/backend/pkg/models/measurements/smart_ata_attribute.go @@ -2,14 +2,12 @@ package measurements import ( "fmt" + "github.com/analogj/scrutiny/webapp/backend/pkg" + "github.com/analogj/scrutiny/webapp/backend/pkg/thresholds" "strconv" "strings" ) -const SmartAttributeStatusPassed = "passed" -const SmartAttributeStatusFailed = "failed" -const SmartAttributeStatusWarning = "warn" - type SmartAtaAttribute struct { AttributeId int `json:"attribute_id"` Name string `json:"name"` @@ -27,6 +25,10 @@ type SmartAtaAttribute struct { FailureRate float64 `json:"failure_rate,omitempty"` } +func (sa *SmartAtaAttribute) GetStatus() string { + return sa.Status +} + func (sa *SmartAtaAttribute) Flatten() map[string]interface{} { idString := strconv.Itoa(sa.AttributeId) @@ -71,81 +73,82 @@ func (sa *SmartAtaAttribute) Inflate(key string, val interface{}) { } } -// -////populate attribute status, using SMART Thresholds & Observed Metadata -//func (sa *SmartAtaAttribute) PopulateAttributeStatus() { -// if strings.ToUpper(sa.WhenFailed) == 
SmartWhenFailedFailingNow { -// //this attribute has previously failed -// sa.Status = SmartAttributeStatusFailed -// sa.StatusReason = "Attribute is failing manufacturer SMART threshold" -// -// } else if strings.ToUpper(sa.WhenFailed) == SmartWhenFailedInThePast { -// sa.Status = SmartAttributeStatusWarning -// sa.StatusReason = "Attribute has previously failed manufacturer SMART threshold" -// } -// -// if smartMetadata, ok := metadata.AtaMetadata[sa.AttributeId]; ok { -// sa.MetadataObservedThresholdStatus(smartMetadata) -// } -// -// //check if status is blank, set to "passed" -// if len(sa.Status) == 0 { -// sa.Status = SmartAttributeStatusPassed -// } -//} -// -//// compare the attribute (raw, normalized, transformed) value to observed thresholds, and update status if necessary -//func (sa *SmartAtaAttribute) MetadataObservedThresholdStatus(smartMetadata metadata.AtaAttributeMetadata) { -// //TODO: multiple rules -// // try to predict the failure rates for observed thresholds that have 0 failure rate and error bars. -// // - if the attribute is critical -// // - the failure rate is over 10 - set to failed -// // - the attribute does not match any threshold, set to warn -// // - if the attribute is not critical -// // - if failure rate is above 20 - set to failed -// // - if failure rate is above 10 but below 20 - set to warn -// -// //update the smart attribute status based on Observed thresholds. 
-// var value int64 -// if smartMetadata.DisplayType == metadata.AtaSmartAttributeDisplayTypeNormalized { -// value = int64(sa.Value) -// } else if smartMetadata.DisplayType == metadata.AtaSmartAttributeDisplayTypeTransformed { -// value = sa.TransformedValue -// } else { -// value = sa.RawValue -// } -// -// for _, obsThresh := range smartMetadata.ObservedThresholds { -// -// //check if "value" is in this bucket -// if ((obsThresh.Low == obsThresh.High) && value == obsThresh.Low) || -// (obsThresh.Low < value && value <= obsThresh.High) { -// sa.FailureRate = obsThresh.AnnualFailureRate -// -// if smartMetadata.Critical { -// if obsThresh.AnnualFailureRate >= 0.10 { -// sa.Status = SmartAttributeStatusFailed -// sa.StatusReason = "Observed Failure Rate for Critical Attribute is greater than 10%" -// } -// } else { -// if obsThresh.AnnualFailureRate >= 0.20 { -// sa.Status = SmartAttributeStatusFailed -// sa.StatusReason = "Observed Failure Rate for Attribute is greater than 20%" -// } else if obsThresh.AnnualFailureRate >= 0.10 { -// sa.Status = SmartAttributeStatusWarning -// sa.StatusReason = "Observed Failure Rate for Attribute is greater than 10%" -// } -// } -// -// //we've found the correct bucket, we can drop out of this loop -// return -// } -// } -// // no bucket found -// if smartMetadata.Critical { -// sa.Status = SmartAttributeStatusWarning -// sa.StatusReason = "Could not determine Observed Failure Rate for Critical Attribute" -// } -// -// return -//} +//populate attribute status, using SMART Thresholds & Observed Metadata +// Chainable +func (sa *SmartAtaAttribute) PopulateAttributeStatus() *SmartAtaAttribute { + if strings.ToUpper(sa.WhenFailed) == pkg.SmartWhenFailedFailingNow { + //this attribute has previously failed + sa.Status = pkg.SmartAttributeStatusFailed + sa.StatusReason = "Attribute is failing manufacturer SMART threshold" + + } else if strings.ToUpper(sa.WhenFailed) == pkg.SmartWhenFailedInThePast { + sa.Status = 
pkg.SmartAttributeStatusWarning + sa.StatusReason = "Attribute has previously failed manufacturer SMART threshold" + } + + if smartMetadata, ok := thresholds.AtaMetadata[sa.AttributeId]; ok { + sa.ValidateThreshold(smartMetadata) + } + + //check if status is blank, set to "passed" + if len(sa.Status) == 0 { + sa.Status = pkg.SmartAttributeStatusPassed + } + return sa +} + +// compare the attribute (raw, normalized, transformed) value to observed thresholds, and update status if necessary +func (sa *SmartAtaAttribute) ValidateThreshold(smartMetadata thresholds.AtaAttributeMetadata) { + //TODO: multiple rules + // try to predict the failure rates for observed thresholds that have 0 failure rate and error bars. + // - if the attribute is critical + // - the failure rate is over 10 - set to failed + // - the attribute does not match any threshold, set to warn + // - if the attribute is not critical + // - if failure rate is above 20 - set to failed + // - if failure rate is above 10 but below 20 - set to warn + + //update the smart attribute status based on Observed thresholds. 
+ var value int64 + if smartMetadata.DisplayType == thresholds.AtaSmartAttributeDisplayTypeNormalized { + value = int64(sa.Value) + } else if smartMetadata.DisplayType == thresholds.AtaSmartAttributeDisplayTypeTransformed { + value = sa.TransformedValue + } else { + value = sa.RawValue + } + + for _, obsThresh := range smartMetadata.ObservedThresholds { + + //check if "value" is in this bucket + if ((obsThresh.Low == obsThresh.High) && value == obsThresh.Low) || + (obsThresh.Low < value && value <= obsThresh.High) { + sa.FailureRate = obsThresh.AnnualFailureRate + + if smartMetadata.Critical { + if obsThresh.AnnualFailureRate >= 0.10 { + sa.Status = pkg.SmartAttributeStatusFailed + sa.StatusReason = "Observed Failure Rate for Critical Attribute is greater than 10%" + } + } else { + if obsThresh.AnnualFailureRate >= 0.20 { + sa.Status = pkg.SmartAttributeStatusFailed + sa.StatusReason = "Observed Failure Rate for Attribute is greater than 20%" + } else if obsThresh.AnnualFailureRate >= 0.10 { + sa.Status = pkg.SmartAttributeStatusWarning + sa.StatusReason = "Observed Failure Rate for Attribute is greater than 10%" + } + } + + //we've found the correct bucket, we can drop out of this loop + return + } + } + // no bucket found + if smartMetadata.Critical { + sa.Status = pkg.SmartAttributeStatusWarning + sa.StatusReason = "Could not determine Observed Failure Rate for Critical Attribute" + } + + return +} diff --git a/webapp/backend/pkg/models/measurements/smart_attribute.go b/webapp/backend/pkg/models/measurements/smart_attribute.go index 1d93bc8..a8aaea6 100644 --- a/webapp/backend/pkg/models/measurements/smart_attribute.go +++ b/webapp/backend/pkg/models/measurements/smart_attribute.go @@ -3,4 +3,5 @@ package measurements type SmartAttribute interface { Flatten() (fields map[string]interface{}) Inflate(key string, val interface{}) + GetStatus() string } diff --git a/webapp/backend/pkg/models/measurements/smart_nvme_attribute.go 
b/webapp/backend/pkg/models/measurements/smart_nvme_attribute.go index 2705ea9..80f1687 100644 --- a/webapp/backend/pkg/models/measurements/smart_nvme_attribute.go +++ b/webapp/backend/pkg/models/measurements/smart_nvme_attribute.go @@ -2,6 +2,8 @@ package measurements import ( "fmt" + "github.com/analogj/scrutiny/webapp/backend/pkg" + "github.com/analogj/scrutiny/webapp/backend/pkg/thresholds" "strings" ) @@ -17,6 +19,10 @@ type SmartNvmeAttribute struct { FailureRate float64 `json:"failure_rate,omitempty"` } +func (sa *SmartNvmeAttribute) GetStatus() string { + return sa.Status +} + func (sa *SmartNvmeAttribute) Flatten() map[string]interface{} { return map[string]interface{}{ fmt.Sprintf("attr.%s.attribute_id", sa.AttributeId): sa.AttributeId, @@ -44,25 +50,26 @@ func (sa *SmartNvmeAttribute) Inflate(key string, val interface{}) { } } -// -////populate attribute status, using SMART Thresholds & Observed Metadata -//func (sa *SmartNvmeAttribute) PopulateAttributeStatus() { -// -// //-1 is a special number meaning no threshold. -// if sa.Threshold != -1 { -// if smartMetadata, ok := metadata.NmveMetadata[sa.AttributeId]; ok { -// //check what the ideal is. Ideal tells us if we our recorded value needs to be above, or below the threshold -// if (smartMetadata.Ideal == "low" && sa.Value > sa.Threshold) || -// (smartMetadata.Ideal == "high" && sa.Value < sa.Threshold) { -// sa.Status = SmartAttributeStatusFailed -// sa.StatusReason = "Attribute is failing recommended SMART threshold" -// } -// } -// } -// //TODO: eventually figure out the critical_warning bits and determine correct error messages here. -// -// //check if status is blank, set to "passed" -// if len(sa.Status) == 0 { -// sa.Status = SmartAttributeStatusPassed -// } -//} +//populate attribute status, using SMART Thresholds & Observed Metadata +// Chainable +func (sa *SmartNvmeAttribute) PopulateAttributeStatus() *SmartNvmeAttribute { + + //-1 is a special number meaning no threshold. 
+ if sa.Threshold != -1 { + if smartMetadata, ok := thresholds.NmveMetadata[sa.AttributeId]; ok { + //check what the ideal is. Ideal tells us if we our recorded value needs to be above, or below the threshold + if (smartMetadata.Ideal == "low" && sa.Value > sa.Threshold) || + (smartMetadata.Ideal == "high" && sa.Value < sa.Threshold) { + sa.Status = pkg.SmartAttributeStatusFailed + sa.StatusReason = "Attribute is failing recommended SMART threshold" + } + } + } + //TODO: eventually figure out the critical_warning bits and determine correct error messages here. + + //check if status is blank, set to "passed" + if len(sa.Status) == 0 { + sa.Status = pkg.SmartAttributeStatusPassed + } + return sa +} diff --git a/webapp/backend/pkg/models/measurements/smart_scsci_attribute.go b/webapp/backend/pkg/models/measurements/smart_scsci_attribute.go index 830036c..d692cef 100644 --- a/webapp/backend/pkg/models/measurements/smart_scsci_attribute.go +++ b/webapp/backend/pkg/models/measurements/smart_scsci_attribute.go @@ -2,6 +2,8 @@ package measurements import ( "fmt" + "github.com/analogj/scrutiny/webapp/backend/pkg" + "github.com/analogj/scrutiny/webapp/backend/pkg/thresholds" "strings" ) @@ -17,6 +19,10 @@ type SmartScsiAttribute struct { FailureRate float64 `json:"failure_rate,omitempty"` } +func (sa *SmartScsiAttribute) GetStatus() string { + return sa.Status +} + func (sa *SmartScsiAttribute) Flatten() map[string]interface{} { return map[string]interface{}{ fmt.Sprintf("attr.%s.attribute_id", sa.AttributeId): sa.AttributeId, @@ -45,23 +51,25 @@ func (sa *SmartScsiAttribute) Inflate(key string, val interface{}) { } // -////populate attribute status, using SMART Thresholds & Observed Metadata -//func (sa *SmartScsiAttribute) PopulateAttributeStatus() { -// -// //-1 is a special number meaning no threshold. -// if sa.Threshold != -1 { -// if smartMetadata, ok := metadata.NmveMetadata[sa.AttributeId]; ok { -// //check what the ideal is. 
Ideal tells us if we our recorded value needs to be above, or below the threshold -// if (smartMetadata.Ideal == "low" && sa.Value > sa.Threshold) || -// (smartMetadata.Ideal == "high" && sa.Value < sa.Threshold) { -// sa.Status = SmartAttributeStatusFailed -// sa.StatusReason = "Attribute is failing recommended SMART threshold" -// } -// } -// } -// -// //check if status is blank, set to "passed" -// if len(sa.Status) == 0 { -// sa.Status = SmartAttributeStatusPassed -// } -//} +//populate attribute status, using SMART Thresholds & Observed Metadata +//Chainable +func (sa *SmartScsiAttribute) PopulateAttributeStatus() *SmartScsiAttribute { + + //-1 is a special number meaning no threshold. + if sa.Threshold != -1 { + if smartMetadata, ok := thresholds.NmveMetadata[sa.AttributeId]; ok { + //check what the ideal is. Ideal tells us if we our recorded value needs to be above, or below the threshold + if (smartMetadata.Ideal == "low" && sa.Value > sa.Threshold) || + (smartMetadata.Ideal == "high" && sa.Value < sa.Threshold) { + sa.Status = pkg.SmartAttributeStatusFailed + sa.StatusReason = "Attribute is failing recommended SMART threshold" + } + } + } + + //check if status is blank, set to "passed" + if len(sa.Status) == 0 { + sa.Status = pkg.SmartAttributeStatusPassed + } + return sa +} diff --git a/webapp/backend/pkg/metadata/ata_attribute_metadata.go b/webapp/backend/pkg/thresholds/ata_attribute_metadata.go similarity index 99% rename from webapp/backend/pkg/metadata/ata_attribute_metadata.go rename to webapp/backend/pkg/thresholds/ata_attribute_metadata.go index 2f4c6c8..0628a18 100644 --- a/webapp/backend/pkg/metadata/ata_attribute_metadata.go +++ b/webapp/backend/pkg/thresholds/ata_attribute_metadata.go @@ -1,4 +1,4 @@ -package metadata +package thresholds const AtaSmartAttributeDisplayTypeRaw = "raw" const AtaSmartAttributeDisplayTypeNormalized = "normalized" diff --git a/webapp/backend/pkg/metadata/nvme_attribute_metadata.go 
b/webapp/backend/pkg/thresholds/nvme_attribute_metadata.go similarity index 99% rename from webapp/backend/pkg/metadata/nvme_attribute_metadata.go rename to webapp/backend/pkg/thresholds/nvme_attribute_metadata.go index 80efcfa..8ac4c8d 100644 --- a/webapp/backend/pkg/metadata/nvme_attribute_metadata.go +++ b/webapp/backend/pkg/thresholds/nvme_attribute_metadata.go @@ -1,4 +1,4 @@ -package metadata +package thresholds // https://media.kingston.com/support/downloads/MKP_521.6_SMART-DCP1000_attribute.pdf // https://www.percona.com/blog/2017/02/09/using-nvme-command-line-tools-to-check-nvme-flash-health/ diff --git a/webapp/backend/pkg/metadata/scsi_attribute_metadata.go b/webapp/backend/pkg/thresholds/scsi_attribute_metadata.go similarity index 99% rename from webapp/backend/pkg/metadata/scsi_attribute_metadata.go rename to webapp/backend/pkg/thresholds/scsi_attribute_metadata.go index cd4f974..51cbbaa 100644 --- a/webapp/backend/pkg/metadata/scsi_attribute_metadata.go +++ b/webapp/backend/pkg/thresholds/scsi_attribute_metadata.go @@ -1,4 +1,4 @@ -package metadata +package thresholds type ScsiAttributeMetadata struct { ID string `json:"-"` diff --git a/webapp/backend/pkg/web/handler/get_device_details.go b/webapp/backend/pkg/web/handler/get_device_details.go index 5807292..d938c88 100644 --- a/webapp/backend/pkg/web/handler/get_device_details.go +++ b/webapp/backend/pkg/web/handler/get_device_details.go @@ -2,7 +2,7 @@ package handler import ( "github.com/analogj/scrutiny/webapp/backend/pkg/database" - "github.com/analogj/scrutiny/webapp/backend/pkg/metadata" + "github.com/analogj/scrutiny/webapp/backend/pkg/thresholds" "github.com/gin-gonic/gin" "github.com/sirupsen/logrus" "net/http" @@ -23,11 +23,11 @@ func GetDeviceDetails(c *gin.Context) { var deviceMetadata interface{} if device.IsAta() { - deviceMetadata = metadata.AtaMetadata + deviceMetadata = thresholds.AtaMetadata } else if device.IsNvme() { - deviceMetadata = metadata.NmveMetadata + deviceMetadata = 
thresholds.NmveMetadata } else if device.IsScsi() { - deviceMetadata = metadata.ScsiMetadata + deviceMetadata = thresholds.ScsiMetadata } c.JSON(http.StatusOK, gin.H{"success": true, "data": map[string]interface{}{"device": device, "smart_results": smartResults}, "metadata": deviceMetadata}) diff --git a/webapp/backend/pkg/web/handler/upload_device_metrics.go b/webapp/backend/pkg/web/handler/upload_device_metrics.go index 5abd134..9fab5b5 100644 --- a/webapp/backend/pkg/web/handler/upload_device_metrics.go +++ b/webapp/backend/pkg/web/handler/upload_device_metrics.go @@ -37,13 +37,23 @@ func UploadDeviceMetrics(c *gin.Context) { } // insert smart info - _, err = deviceRepo.SaveSmartAttributes(c, c.Param("wwn"), collectorSmartData) + smartData, err := deviceRepo.SaveSmartAttributes(c, c.Param("wwn"), collectorSmartData) if err != nil { logger.Errorln("An error occurred while saving smartctl metrics", err) c.JSON(http.StatusInternalServerError, gin.H{"success": false}) return } + if smartData.Status != pkg.DeviceStatusPassed { + //there is a failure detected by Scrutiny, update the device status on the homepage. + updatedDevice, err = deviceRepo.UpdateDeviceStatus(c, c.Param("wwn"), smartData.Status) + if err != nil { + logger.Errorln("An error occurred while updating device status", err) + c.JSON(http.StatusInternalServerError, gin.H{"success": false}) + return + } + } + // save smart temperature data (ignore failures) err = deviceRepo.SaveSmartTemperature(c, c.Param("wwn"), updatedDevice.DeviceProtocol, collectorSmartData) if err != nil {