validate thresholds whenever SMART data is recieved.

pull/228/head
Jason Kulatunga 3 years ago
parent 1fc910f41b
commit 80f4660130

@ -2,6 +2,7 @@ package database
import (
"context"
"github.com/analogj/scrutiny/webapp/backend/pkg"
"github.com/analogj/scrutiny/webapp/backend/pkg/models"
"github.com/analogj/scrutiny/webapp/backend/pkg/models/collector"
"github.com/analogj/scrutiny/webapp/backend/pkg/models/measurements"
@ -16,6 +17,7 @@ type DeviceRepo interface {
RegisterDevice(ctx context.Context, dev models.Device) error
GetDevices(ctx context.Context) ([]models.Device, error)
UpdateDevice(ctx context.Context, wwn string, collectorSmartData collector.SmartInfo) (models.Device, error)
UpdateDeviceStatus(ctx context.Context, wwn string, status pkg.DeviceStatus) (models.Device, error)
GetDeviceDetails(ctx context.Context, wwn string) (models.Device, error)
SaveSmartAttributes(ctx context.Context, wwn string, collectorSmartData collector.SmartInfo) (measurements.Smart, error)

@ -3,6 +3,7 @@ package database
import (
"context"
"fmt"
"github.com/analogj/scrutiny/webapp/backend/pkg"
"github.com/analogj/scrutiny/webapp/backend/pkg/config"
"github.com/analogj/scrutiny/webapp/backend/pkg/models"
"github.com/analogj/scrutiny/webapp/backend/pkg/models/collector"
@ -163,6 +164,17 @@ func (sr *scrutinyRepository) UpdateDevice(ctx context.Context, wwn string, coll
return device, sr.gormClient.Model(&device).Updates(device).Error
}
//Update Device Status
func (sr *scrutinyRepository) UpdateDeviceStatus(ctx context.Context, wwn string, status pkg.DeviceStatus) (models.Device, error) {
var device models.Device
if err := sr.gormClient.WithContext(ctx).Where("wwn = ?", wwn).First(&device).Error; err != nil {
return device, fmt.Errorf("Could not get device from DB", err)
}
device.DeviceStatus = pkg.Set(device.DeviceStatus, status)
return device, sr.gormClient.Model(&device).Updates(device).Error
}
func (sr *scrutinyRepository) GetDeviceDetails(ctx context.Context, wwn string) (models.Device, error) {
var device models.Device
@ -434,3 +446,11 @@ func (sr *scrutinyRepository) GetSummary(ctx context.Context) (map[string]*model
return summaries, nil
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Process Thresholds
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
func (sr *scrutinyRepository) ProcessSmartAttributeThresholds() {
}

@ -3,8 +3,8 @@ package measurements
import (
"fmt"
"github.com/analogj/scrutiny/webapp/backend/pkg"
"github.com/analogj/scrutiny/webapp/backend/pkg/metadata"
"github.com/analogj/scrutiny/webapp/backend/pkg/models/collector"
"github.com/analogj/scrutiny/webapp/backend/pkg/thresholds"
"log"
"strings"
"time"
@ -22,6 +22,9 @@ type Smart struct {
//Attributes (fields)
Attributes map[string]SmartAttribute `json:"attrs"`
//status
Status pkg.DeviceStatus
}
func (sm *Smart) Flatten() (tags map[string]string, fields map[string]interface{}) {
@ -133,6 +136,7 @@ func (sm *Smart) FromCollectorSmartInfo(wwn string, info collector.SmartInfo) er
//generate SmartAtaAttribute entries from Scrutiny Collector Smart data.
func (sm *Smart) ProcessAtaSmartInfo(info collector.SmartInfo) {
sm.Status = pkg.DeviceStatusPassed
for _, collectorAttr := range info.AtaSmartAttributes.Table {
attrModel := SmartAtaAttribute{
AttributeId: collectorAttr.ID,
@ -146,53 +150,72 @@ func (sm *Smart) ProcessAtaSmartInfo(info collector.SmartInfo) {
}
//now that we've parsed the data from the smartctl response, lets match it against our metadata rules and add additional Scrutiny specific data.
if smartMetadata, ok := metadata.AtaMetadata[collectorAttr.ID]; ok {
if smartMetadata, ok := thresholds.AtaMetadata[collectorAttr.ID]; ok {
attrModel.Name = smartMetadata.DisplayName
if smartMetadata.Transform != nil {
attrModel.TransformedValue = smartMetadata.Transform(attrModel.Value, attrModel.RawValue, attrModel.RawString)
}
}
attrModel.PopulateAttributeStatus()
sm.Attributes[string(collectorAttr.ID)] = &attrModel
if attrModel.Status == pkg.SmartAttributeStatusFailed {
sm.Status = pkg.DeviceStatusFailedScrutiny
}
}
}
//generate SmartNvmeAttribute entries from Scrutiny Collector Smart data.
func (sm *Smart) ProcessNvmeSmartInfo(info collector.SmartInfo) {
sm.Attributes = map[string]SmartAttribute{
"critical_warning": &SmartNvmeAttribute{AttributeId: "critical_warning", Name: "Critical Warning", Value: info.NvmeSmartHealthInformationLog.CriticalWarning, Threshold: 0},
"temperature": &SmartNvmeAttribute{AttributeId: "temperature", Name: "Temperature", Value: info.NvmeSmartHealthInformationLog.Temperature, Threshold: -1},
"available_spare": &SmartNvmeAttribute{AttributeId: "available_spare", Name: "Available Spare", Value: info.NvmeSmartHealthInformationLog.AvailableSpare, Threshold: info.NvmeSmartHealthInformationLog.AvailableSpareThreshold},
"percentage_used": &SmartNvmeAttribute{AttributeId: "percentage_used", Name: "Percentage Used", Value: info.NvmeSmartHealthInformationLog.PercentageUsed, Threshold: 100},
"data_units_read": &SmartNvmeAttribute{AttributeId: "data_units_read", Name: "Data Units Read", Value: info.NvmeSmartHealthInformationLog.DataUnitsRead, Threshold: -1},
"data_units_written": &SmartNvmeAttribute{AttributeId: "data_units_written", Name: "Data Units Written", Value: info.NvmeSmartHealthInformationLog.DataUnitsWritten, Threshold: -1},
"host_reads": &SmartNvmeAttribute{AttributeId: "host_reads", Name: "Host Reads", Value: info.NvmeSmartHealthInformationLog.HostReads, Threshold: -1},
"host_writes": &SmartNvmeAttribute{AttributeId: "host_writes", Name: "Host Writes", Value: info.NvmeSmartHealthInformationLog.HostWrites, Threshold: -1},
"controller_busy_time": &SmartNvmeAttribute{AttributeId: "controller_busy_time", Name: "Controller Busy Time", Value: info.NvmeSmartHealthInformationLog.ControllerBusyTime, Threshold: -1},
"power_cycles": &SmartNvmeAttribute{AttributeId: "power_cycles", Name: "Power Cycles", Value: info.NvmeSmartHealthInformationLog.PowerCycles, Threshold: -1},
"power_on_hours": &SmartNvmeAttribute{AttributeId: "power_on_hours", Name: "Power on Hours", Value: info.NvmeSmartHealthInformationLog.PowerOnHours, Threshold: -1},
"unsafe_shutdowns": &SmartNvmeAttribute{AttributeId: "unsafe_shutdowns", Name: "Unsafe Shutdowns", Value: info.NvmeSmartHealthInformationLog.UnsafeShutdowns, Threshold: -1},
"media_errors": &SmartNvmeAttribute{AttributeId: "media_errors", Name: "Media Errors", Value: info.NvmeSmartHealthInformationLog.MediaErrors, Threshold: 0},
"num_err_log_entries": &SmartNvmeAttribute{AttributeId: "num_err_log_entries", Name: "Numb Err Log Entries", Value: info.NvmeSmartHealthInformationLog.NumErrLogEntries, Threshold: 0},
"warning_temp_time": &SmartNvmeAttribute{AttributeId: "warning_temp_time", Name: "Warning Temp Time", Value: info.NvmeSmartHealthInformationLog.WarningTempTime, Threshold: -1},
"critical_comp_time": &SmartNvmeAttribute{AttributeId: "critical_comp_time", Name: "Critical CompTime", Value: info.NvmeSmartHealthInformationLog.CriticalCompTime, Threshold: -1},
"critical_warning": (&SmartNvmeAttribute{AttributeId: "critical_warning", Name: "Critical Warning", Value: info.NvmeSmartHealthInformationLog.CriticalWarning, Threshold: 0}).PopulateAttributeStatus(),
"temperature": (&SmartNvmeAttribute{AttributeId: "temperature", Name: "Temperature", Value: info.NvmeSmartHealthInformationLog.Temperature, Threshold: -1}).PopulateAttributeStatus(),
"available_spare": (&SmartNvmeAttribute{AttributeId: "available_spare", Name: "Available Spare", Value: info.NvmeSmartHealthInformationLog.AvailableSpare, Threshold: info.NvmeSmartHealthInformationLog.AvailableSpareThreshold}).PopulateAttributeStatus(),
"percentage_used": (&SmartNvmeAttribute{AttributeId: "percentage_used", Name: "Percentage Used", Value: info.NvmeSmartHealthInformationLog.PercentageUsed, Threshold: 100}).PopulateAttributeStatus(),
"data_units_read": (&SmartNvmeAttribute{AttributeId: "data_units_read", Name: "Data Units Read", Value: info.NvmeSmartHealthInformationLog.DataUnitsRead, Threshold: -1}).PopulateAttributeStatus(),
"data_units_written": (&SmartNvmeAttribute{AttributeId: "data_units_written", Name: "Data Units Written", Value: info.NvmeSmartHealthInformationLog.DataUnitsWritten, Threshold: -1}).PopulateAttributeStatus(),
"host_reads": (&SmartNvmeAttribute{AttributeId: "host_reads", Name: "Host Reads", Value: info.NvmeSmartHealthInformationLog.HostReads, Threshold: -1}).PopulateAttributeStatus(),
"host_writes": (&SmartNvmeAttribute{AttributeId: "host_writes", Name: "Host Writes", Value: info.NvmeSmartHealthInformationLog.HostWrites, Threshold: -1}).PopulateAttributeStatus(),
"controller_busy_time": (&SmartNvmeAttribute{AttributeId: "controller_busy_time", Name: "Controller Busy Time", Value: info.NvmeSmartHealthInformationLog.ControllerBusyTime, Threshold: -1}).PopulateAttributeStatus(),
"power_cycles": (&SmartNvmeAttribute{AttributeId: "power_cycles", Name: "Power Cycles", Value: info.NvmeSmartHealthInformationLog.PowerCycles, Threshold: -1}).PopulateAttributeStatus(),
"power_on_hours": (&SmartNvmeAttribute{AttributeId: "power_on_hours", Name: "Power on Hours", Value: info.NvmeSmartHealthInformationLog.PowerOnHours, Threshold: -1}).PopulateAttributeStatus(),
"unsafe_shutdowns": (&SmartNvmeAttribute{AttributeId: "unsafe_shutdowns", Name: "Unsafe Shutdowns", Value: info.NvmeSmartHealthInformationLog.UnsafeShutdowns, Threshold: -1}).PopulateAttributeStatus(),
"media_errors": (&SmartNvmeAttribute{AttributeId: "media_errors", Name: "Media Errors", Value: info.NvmeSmartHealthInformationLog.MediaErrors, Threshold: 0}).PopulateAttributeStatus(),
"num_err_log_entries": (&SmartNvmeAttribute{AttributeId: "num_err_log_entries", Name: "Numb Err Log Entries", Value: info.NvmeSmartHealthInformationLog.NumErrLogEntries, Threshold: 0}).PopulateAttributeStatus(),
"warning_temp_time": (&SmartNvmeAttribute{AttributeId: "warning_temp_time", Name: "Warning Temp Time", Value: info.NvmeSmartHealthInformationLog.WarningTempTime, Threshold: -1}).PopulateAttributeStatus(),
"critical_comp_time": (&SmartNvmeAttribute{AttributeId: "critical_comp_time", Name: "Critical CompTime", Value: info.NvmeSmartHealthInformationLog.CriticalCompTime, Threshold: -1}).PopulateAttributeStatus(),
}
//find analyzed attribute status
for _, val := range sm.Attributes {
if val.GetStatus() == pkg.SmartAttributeStatusFailed {
sm.Status = pkg.DeviceStatusFailedScrutiny
}
}
}
//generate SmartScsiAttribute entries from Scrutiny Collector Smart data.
func (sm *Smart) ProcessScsiSmartInfo(info collector.SmartInfo) {
sm.Attributes = map[string]SmartAttribute{
"scsi_grown_defect_list": &SmartScsiAttribute{AttributeId: "scsi_grown_defect_list", Name: "Grown Defect List", Value: info.ScsiGrownDefectList, Threshold: 0},
"read_errors_corrected_by_eccfast": &SmartScsiAttribute{AttributeId: "read_errors_corrected_by_eccfast", Name: "Read Errors Corrected by ECC Fast", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByEccfast, Threshold: -1},
"read_errors_corrected_by_eccdelayed": &SmartScsiAttribute{AttributeId: "read_errors_corrected_by_eccdelayed", Name: "Read Errors Corrected by ECC Delayed", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByEccdelayed, Threshold: -1},
"read_errors_corrected_by_rereads_rewrites": &SmartScsiAttribute{AttributeId: "read_errors_corrected_by_rereads_rewrites", Name: "Read Errors Corrected by ReReads/ReWrites", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByRereadsRewrites, Threshold: 0},
"read_total_errors_corrected": &SmartScsiAttribute{AttributeId: "read_total_errors_corrected", Name: "Read Total Errors Corrected", Value: info.ScsiErrorCounterLog.Read.TotalErrorsCorrected, Threshold: -1},
"read_correction_algorithm_invocations": &SmartScsiAttribute{AttributeId: "read_correction_algorithm_invocations", Name: "Read Correction Algorithm Invocations", Value: info.ScsiErrorCounterLog.Read.CorrectionAlgorithmInvocations, Threshold: -1},
"read_total_uncorrected_errors": &SmartScsiAttribute{AttributeId: "read_total_uncorrected_errors", Name: "Read Total Uncorrected Errors", Value: info.ScsiErrorCounterLog.Read.TotalUncorrectedErrors, Threshold: 0},
"write_errors_corrected_by_eccfast": &SmartScsiAttribute{AttributeId: "write_errors_corrected_by_eccfast", Name: "Write Errors Corrected by ECC Fast", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByEccfast, Threshold: -1},
"write_errors_corrected_by_eccdelayed": &SmartScsiAttribute{AttributeId: "write_errors_corrected_by_eccdelayed", Name: "Write Errors Corrected by ECC Delayed", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByEccdelayed, Threshold: -1},
"write_errors_corrected_by_rereads_rewrites": &SmartScsiAttribute{AttributeId: "write_errors_corrected_by_rereads_rewrites", Name: "Write Errors Corrected by ReReads/ReWrites", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByRereadsRewrites, Threshold: 0},
"write_total_errors_corrected": &SmartScsiAttribute{AttributeId: "write_total_errors_corrected", Name: "Write Total Errors Corrected", Value: info.ScsiErrorCounterLog.Write.TotalErrorsCorrected, Threshold: -1},
"write_correction_algorithm_invocations": &SmartScsiAttribute{AttributeId: "write_correction_algorithm_invocations", Name: "Write Correction Algorithm Invocations", Value: info.ScsiErrorCounterLog.Write.CorrectionAlgorithmInvocations, Threshold: -1},
"write_total_uncorrected_errors": &SmartScsiAttribute{AttributeId: "write_total_uncorrected_errors", Name: "Write Total Uncorrected Errors", Value: info.ScsiErrorCounterLog.Write.TotalUncorrectedErrors, Threshold: 0},
"scsi_grown_defect_list": (&SmartScsiAttribute{AttributeId: "scsi_grown_defect_list", Name: "Grown Defect List", Value: info.ScsiGrownDefectList, Threshold: 0}).PopulateAttributeStatus(),
"read_errors_corrected_by_eccfast": (&SmartScsiAttribute{AttributeId: "read_errors_corrected_by_eccfast", Name: "Read Errors Corrected by ECC Fast", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByEccfast, Threshold: -1}).PopulateAttributeStatus(),
"read_errors_corrected_by_eccdelayed": (&SmartScsiAttribute{AttributeId: "read_errors_corrected_by_eccdelayed", Name: "Read Errors Corrected by ECC Delayed", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByEccdelayed, Threshold: -1}).PopulateAttributeStatus(),
"read_errors_corrected_by_rereads_rewrites": (&SmartScsiAttribute{AttributeId: "read_errors_corrected_by_rereads_rewrites", Name: "Read Errors Corrected by ReReads/ReWrites", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByRereadsRewrites, Threshold: 0}).PopulateAttributeStatus(),
"read_total_errors_corrected": (&SmartScsiAttribute{AttributeId: "read_total_errors_corrected", Name: "Read Total Errors Corrected", Value: info.ScsiErrorCounterLog.Read.TotalErrorsCorrected, Threshold: -1}).PopulateAttributeStatus(),
"read_correction_algorithm_invocations": (&SmartScsiAttribute{AttributeId: "read_correction_algorithm_invocations", Name: "Read Correction Algorithm Invocations", Value: info.ScsiErrorCounterLog.Read.CorrectionAlgorithmInvocations, Threshold: -1}).PopulateAttributeStatus(),
"read_total_uncorrected_errors": (&SmartScsiAttribute{AttributeId: "read_total_uncorrected_errors", Name: "Read Total Uncorrected Errors", Value: info.ScsiErrorCounterLog.Read.TotalUncorrectedErrors, Threshold: 0}).PopulateAttributeStatus(),
"write_errors_corrected_by_eccfast": (&SmartScsiAttribute{AttributeId: "write_errors_corrected_by_eccfast", Name: "Write Errors Corrected by ECC Fast", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByEccfast, Threshold: -1}).PopulateAttributeStatus(),
"write_errors_corrected_by_eccdelayed": (&SmartScsiAttribute{AttributeId: "write_errors_corrected_by_eccdelayed", Name: "Write Errors Corrected by ECC Delayed", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByEccdelayed, Threshold: -1}).PopulateAttributeStatus(),
"write_errors_corrected_by_rereads_rewrites": (&SmartScsiAttribute{AttributeId: "write_errors_corrected_by_rereads_rewrites", Name: "Write Errors Corrected by ReReads/ReWrites", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByRereadsRewrites, Threshold: 0}).PopulateAttributeStatus(),
"write_total_errors_corrected": (&SmartScsiAttribute{AttributeId: "write_total_errors_corrected", Name: "Write Total Errors Corrected", Value: info.ScsiErrorCounterLog.Write.TotalErrorsCorrected, Threshold: -1}).PopulateAttributeStatus(),
"write_correction_algorithm_invocations": (&SmartScsiAttribute{AttributeId: "write_correction_algorithm_invocations", Name: "Write Correction Algorithm Invocations", Value: info.ScsiErrorCounterLog.Write.CorrectionAlgorithmInvocations, Threshold: -1}).PopulateAttributeStatus(),
"write_total_uncorrected_errors": (&SmartScsiAttribute{AttributeId: "write_total_uncorrected_errors", Name: "Write Total Uncorrected Errors", Value: info.ScsiErrorCounterLog.Write.TotalUncorrectedErrors, Threshold: 0}).PopulateAttributeStatus(),
}
//find analyzed attribute status
for _, val := range sm.Attributes {
if val.GetStatus() == pkg.SmartAttributeStatusFailed {
sm.Status = pkg.DeviceStatusFailedScrutiny
}
}
}

@ -2,14 +2,12 @@ package measurements
import (
"fmt"
"github.com/analogj/scrutiny/webapp/backend/pkg"
"github.com/analogj/scrutiny/webapp/backend/pkg/thresholds"
"strconv"
"strings"
)
const SmartAttributeStatusPassed = "passed"
const SmartAttributeStatusFailed = "failed"
const SmartAttributeStatusWarning = "warn"
type SmartAtaAttribute struct {
AttributeId int `json:"attribute_id"`
Name string `json:"name"`
@ -27,6 +25,10 @@ type SmartAtaAttribute struct {
FailureRate float64 `json:"failure_rate,omitempty"`
}
func (sa *SmartAtaAttribute) GetStatus() string {
return sa.Status
}
func (sa *SmartAtaAttribute) Flatten() map[string]interface{} {
idString := strconv.Itoa(sa.AttributeId)
@ -71,81 +73,82 @@ func (sa *SmartAtaAttribute) Inflate(key string, val interface{}) {
}
}
//
////populate attribute status, using SMART Thresholds & Observed Metadata
//func (sa *SmartAtaAttribute) PopulateAttributeStatus() {
// if strings.ToUpper(sa.WhenFailed) == SmartWhenFailedFailingNow {
// //this attribute has previously failed
// sa.Status = SmartAttributeStatusFailed
// sa.StatusReason = "Attribute is failing manufacturer SMART threshold"
//
// } else if strings.ToUpper(sa.WhenFailed) == SmartWhenFailedInThePast {
// sa.Status = SmartAttributeStatusWarning
// sa.StatusReason = "Attribute has previously failed manufacturer SMART threshold"
// }
//
// if smartMetadata, ok := metadata.AtaMetadata[sa.AttributeId]; ok {
// sa.MetadataObservedThresholdStatus(smartMetadata)
// }
//
// //check if status is blank, set to "passed"
// if len(sa.Status) == 0 {
// sa.Status = SmartAttributeStatusPassed
// }
//}
//
//// compare the attribute (raw, normalized, transformed) value to observed thresholds, and update status if necessary
//func (sa *SmartAtaAttribute) MetadataObservedThresholdStatus(smartMetadata metadata.AtaAttributeMetadata) {
// //TODO: multiple rules
// // try to predict the failure rates for observed thresholds that have 0 failure rate and error bars.
// // - if the attribute is critical
// // - the failure rate is over 10 - set to failed
// // - the attribute does not match any threshold, set to warn
// // - if the attribute is not critical
// // - if failure rate is above 20 - set to failed
// // - if failure rate is above 10 but below 20 - set to warn
//
// //update the smart attribute status based on Observed thresholds.
// var value int64
// if smartMetadata.DisplayType == metadata.AtaSmartAttributeDisplayTypeNormalized {
// value = int64(sa.Value)
// } else if smartMetadata.DisplayType == metadata.AtaSmartAttributeDisplayTypeTransformed {
// value = sa.TransformedValue
// } else {
// value = sa.RawValue
// }
//
// for _, obsThresh := range smartMetadata.ObservedThresholds {
//
// //check if "value" is in this bucket
// if ((obsThresh.Low == obsThresh.High) && value == obsThresh.Low) ||
// (obsThresh.Low < value && value <= obsThresh.High) {
// sa.FailureRate = obsThresh.AnnualFailureRate
//
// if smartMetadata.Critical {
// if obsThresh.AnnualFailureRate >= 0.10 {
// sa.Status = SmartAttributeStatusFailed
// sa.StatusReason = "Observed Failure Rate for Critical Attribute is greater than 10%"
// }
// } else {
// if obsThresh.AnnualFailureRate >= 0.20 {
// sa.Status = SmartAttributeStatusFailed
// sa.StatusReason = "Observed Failure Rate for Attribute is greater than 20%"
// } else if obsThresh.AnnualFailureRate >= 0.10 {
// sa.Status = SmartAttributeStatusWarning
// sa.StatusReason = "Observed Failure Rate for Attribute is greater than 10%"
// }
// }
//
// //we've found the correct bucket, we can drop out of this loop
// return
// }
// }
// // no bucket found
// if smartMetadata.Critical {
// sa.Status = SmartAttributeStatusWarning
// sa.StatusReason = "Could not determine Observed Failure Rate for Critical Attribute"
// }
//
// return
//}
//populate attribute status, using SMART Thresholds & Observed Metadata
// Chainable
func (sa *SmartAtaAttribute) PopulateAttributeStatus() *SmartAtaAttribute {
if strings.ToUpper(sa.WhenFailed) == pkg.SmartWhenFailedFailingNow {
//this attribute has previously failed
sa.Status = pkg.SmartAttributeStatusFailed
sa.StatusReason = "Attribute is failing manufacturer SMART threshold"
} else if strings.ToUpper(sa.WhenFailed) == pkg.SmartWhenFailedInThePast {
sa.Status = pkg.SmartAttributeStatusWarning
sa.StatusReason = "Attribute has previously failed manufacturer SMART threshold"
}
if smartMetadata, ok := thresholds.AtaMetadata[sa.AttributeId]; ok {
sa.ValidateThreshold(smartMetadata)
}
//check if status is blank, set to "passed"
if len(sa.Status) == 0 {
sa.Status = pkg.SmartAttributeStatusPassed
}
return sa
}
// compare the attribute (raw, normalized, transformed) value to observed thresholds, and update status if necessary
func (sa *SmartAtaAttribute) ValidateThreshold(smartMetadata thresholds.AtaAttributeMetadata) {
//TODO: multiple rules
// try to predict the failure rates for observed thresholds that have 0 failure rate and error bars.
// - if the attribute is critical
// - the failure rate is over 10 - set to failed
// - the attribute does not match any threshold, set to warn
// - if the attribute is not critical
// - if failure rate is above 20 - set to failed
// - if failure rate is above 10 but below 20 - set to warn
//update the smart attribute status based on Observed thresholds.
var value int64
if smartMetadata.DisplayType == thresholds.AtaSmartAttributeDisplayTypeNormalized {
value = int64(sa.Value)
} else if smartMetadata.DisplayType == thresholds.AtaSmartAttributeDisplayTypeTransformed {
value = sa.TransformedValue
} else {
value = sa.RawValue
}
for _, obsThresh := range smartMetadata.ObservedThresholds {
//check if "value" is in this bucket
if ((obsThresh.Low == obsThresh.High) && value == obsThresh.Low) ||
(obsThresh.Low < value && value <= obsThresh.High) {
sa.FailureRate = obsThresh.AnnualFailureRate
if smartMetadata.Critical {
if obsThresh.AnnualFailureRate >= 0.10 {
sa.Status = pkg.SmartAttributeStatusFailed
sa.StatusReason = "Observed Failure Rate for Critical Attribute is greater than 10%"
}
} else {
if obsThresh.AnnualFailureRate >= 0.20 {
sa.Status = pkg.SmartAttributeStatusFailed
sa.StatusReason = "Observed Failure Rate for Attribute is greater than 20%"
} else if obsThresh.AnnualFailureRate >= 0.10 {
sa.Status = pkg.SmartAttributeStatusWarning
sa.StatusReason = "Observed Failure Rate for Attribute is greater than 10%"
}
}
//we've found the correct bucket, we can drop out of this loop
return
}
}
// no bucket found
if smartMetadata.Critical {
sa.Status = pkg.SmartAttributeStatusWarning
sa.StatusReason = "Could not determine Observed Failure Rate for Critical Attribute"
}
return
}

@ -3,4 +3,5 @@ package measurements
type SmartAttribute interface {
Flatten() (fields map[string]interface{})
Inflate(key string, val interface{})
GetStatus() string
}

@ -2,6 +2,8 @@ package measurements
import (
"fmt"
"github.com/analogj/scrutiny/webapp/backend/pkg"
"github.com/analogj/scrutiny/webapp/backend/pkg/thresholds"
"strings"
)
@ -17,6 +19,10 @@ type SmartNvmeAttribute struct {
FailureRate float64 `json:"failure_rate,omitempty"`
}
func (sa *SmartNvmeAttribute) GetStatus() string {
return sa.Status
}
func (sa *SmartNvmeAttribute) Flatten() map[string]interface{} {
return map[string]interface{}{
fmt.Sprintf("attr.%s.attribute_id", sa.AttributeId): sa.AttributeId,
@ -44,25 +50,26 @@ func (sa *SmartNvmeAttribute) Inflate(key string, val interface{}) {
}
}
//
////populate attribute status, using SMART Thresholds & Observed Metadata
//func (sa *SmartNvmeAttribute) PopulateAttributeStatus() {
//
// //-1 is a special number meaning no threshold.
// if sa.Threshold != -1 {
// if smartMetadata, ok := metadata.NmveMetadata[sa.AttributeId]; ok {
// //check what the ideal is. Ideal tells us if we our recorded value needs to be above, or below the threshold
// if (smartMetadata.Ideal == "low" && sa.Value > sa.Threshold) ||
// (smartMetadata.Ideal == "high" && sa.Value < sa.Threshold) {
// sa.Status = SmartAttributeStatusFailed
// sa.StatusReason = "Attribute is failing recommended SMART threshold"
// }
// }
// }
// //TODO: eventually figure out the critical_warning bits and determine correct error messages here.
//
// //check if status is blank, set to "passed"
// if len(sa.Status) == 0 {
// sa.Status = SmartAttributeStatusPassed
// }
//}
//populate attribute status, using SMART Thresholds & Observed Metadata
// Chainable
func (sa *SmartNvmeAttribute) PopulateAttributeStatus() *SmartNvmeAttribute {
//-1 is a special number meaning no threshold.
if sa.Threshold != -1 {
if smartMetadata, ok := thresholds.NmveMetadata[sa.AttributeId]; ok {
//check what the ideal is. Ideal tells us if we our recorded value needs to be above, or below the threshold
if (smartMetadata.Ideal == "low" && sa.Value > sa.Threshold) ||
(smartMetadata.Ideal == "high" && sa.Value < sa.Threshold) {
sa.Status = pkg.SmartAttributeStatusFailed
sa.StatusReason = "Attribute is failing recommended SMART threshold"
}
}
}
//TODO: eventually figure out the critical_warning bits and determine correct error messages here.
//check if status is blank, set to "passed"
if len(sa.Status) == 0 {
sa.Status = pkg.SmartAttributeStatusPassed
}
return sa
}

@ -2,6 +2,8 @@ package measurements
import (
"fmt"
"github.com/analogj/scrutiny/webapp/backend/pkg"
"github.com/analogj/scrutiny/webapp/backend/pkg/thresholds"
"strings"
)
@ -17,6 +19,10 @@ type SmartScsiAttribute struct {
FailureRate float64 `json:"failure_rate,omitempty"`
}
func (sa *SmartScsiAttribute) GetStatus() string {
return sa.Status
}
func (sa *SmartScsiAttribute) Flatten() map[string]interface{} {
return map[string]interface{}{
fmt.Sprintf("attr.%s.attribute_id", sa.AttributeId): sa.AttributeId,
@ -45,23 +51,25 @@ func (sa *SmartScsiAttribute) Inflate(key string, val interface{}) {
}
//
////populate attribute status, using SMART Thresholds & Observed Metadata
//func (sa *SmartScsiAttribute) PopulateAttributeStatus() {
//
// //-1 is a special number meaning no threshold.
// if sa.Threshold != -1 {
// if smartMetadata, ok := metadata.NmveMetadata[sa.AttributeId]; ok {
// //check what the ideal is. Ideal tells us if we our recorded value needs to be above, or below the threshold
// if (smartMetadata.Ideal == "low" && sa.Value > sa.Threshold) ||
// (smartMetadata.Ideal == "high" && sa.Value < sa.Threshold) {
// sa.Status = SmartAttributeStatusFailed
// sa.StatusReason = "Attribute is failing recommended SMART threshold"
// }
// }
// }
//
// //check if status is blank, set to "passed"
// if len(sa.Status) == 0 {
// sa.Status = SmartAttributeStatusPassed
// }
//}
//populate attribute status, using SMART Thresholds & Observed Metadata
//Chainable
func (sa *SmartScsiAttribute) PopulateAttributeStatus() *SmartScsiAttribute {
//-1 is a special number meaning no threshold.
if sa.Threshold != -1 {
if smartMetadata, ok := thresholds.NmveMetadata[sa.AttributeId]; ok {
//check what the ideal is. Ideal tells us if we our recorded value needs to be above, or below the threshold
if (smartMetadata.Ideal == "low" && sa.Value > sa.Threshold) ||
(smartMetadata.Ideal == "high" && sa.Value < sa.Threshold) {
sa.Status = pkg.SmartAttributeStatusFailed
sa.StatusReason = "Attribute is failing recommended SMART threshold"
}
}
}
//check if status is blank, set to "passed"
if len(sa.Status) == 0 {
sa.Status = pkg.SmartAttributeStatusPassed
}
return sa
}

@ -1,4 +1,4 @@
package metadata
package thresholds
const AtaSmartAttributeDisplayTypeRaw = "raw"
const AtaSmartAttributeDisplayTypeNormalized = "normalized"

@ -1,4 +1,4 @@
package metadata
package thresholds
// https://media.kingston.com/support/downloads/MKP_521.6_SMART-DCP1000_attribute.pdf
// https://www.percona.com/blog/2017/02/09/using-nvme-command-line-tools-to-check-nvme-flash-health/

@ -1,4 +1,4 @@
package metadata
package thresholds
type ScsiAttributeMetadata struct {
ID string `json:"-"`

@ -2,7 +2,7 @@ package handler
import (
"github.com/analogj/scrutiny/webapp/backend/pkg/database"
"github.com/analogj/scrutiny/webapp/backend/pkg/metadata"
"github.com/analogj/scrutiny/webapp/backend/pkg/thresholds"
"github.com/gin-gonic/gin"
"github.com/sirupsen/logrus"
"net/http"
@ -23,11 +23,11 @@ func GetDeviceDetails(c *gin.Context) {
var deviceMetadata interface{}
if device.IsAta() {
deviceMetadata = metadata.AtaMetadata
deviceMetadata = thresholds.AtaMetadata
} else if device.IsNvme() {
deviceMetadata = metadata.NmveMetadata
deviceMetadata = thresholds.NmveMetadata
} else if device.IsScsi() {
deviceMetadata = metadata.ScsiMetadata
deviceMetadata = thresholds.ScsiMetadata
}
c.JSON(http.StatusOK, gin.H{"success": true, "data": map[string]interface{}{"device": device, "smart_results": smartResults}, "metadata": deviceMetadata})

@ -37,13 +37,23 @@ func UploadDeviceMetrics(c *gin.Context) {
}
// insert smart info
_, err = deviceRepo.SaveSmartAttributes(c, c.Param("wwn"), collectorSmartData)
smartData, err := deviceRepo.SaveSmartAttributes(c, c.Param("wwn"), collectorSmartData)
if err != nil {
logger.Errorln("An error occurred while saving smartctl metrics", err)
c.JSON(http.StatusInternalServerError, gin.H{"success": false})
return
}
if smartData.Status != pkg.DeviceStatusPassed {
//there is a failure detected by Scrutiny, update the device status on the homepage.
updatedDevice, err = deviceRepo.UpdateDeviceStatus(c, c.Param("wwn"), smartData.Status)
if err != nil {
logger.Errorln("An error occurred while updating device status", err)
c.JSON(http.StatusInternalServerError, gin.H{"success": false})
return
}
}
// save smart temperature data (ignore failures)
err = deviceRepo.SaveSmartTemperature(c, c.Param("wwn"), updatedDevice.DeviceProtocol, collectorSmartData)
if err != nil {

Loading…
Cancel
Save