Merge pull request #46 from AnalogJHQ/scsi_nvme_ui

Added ideal values and thresholds for NVMe and SCSI drives.
pull/1/head
Jason Kulatunga 4 years ago committed by GitHub
commit ea9e7ec218
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

1
.gitignore vendored

@ -61,3 +61,4 @@ vendor
/scrutiny-collector-metrics-linux-amd64
/scrutiny-web-linux-amd64
scrutiny-*.db
scrutiny_test.db

@ -21,7 +21,7 @@ var NmveMetadata = map[string]NvmeAttributeMetadata{
ID: "critical_warning",
DisplayName: "Critical Warning",
DisplayType: "",
Ideal: "",
Ideal: "low",
Critical: true,
Description: "This field indicates critical warnings for the state of the controller. Each bit corresponds to a critical warning type; multiple bits may be set. If a bit is cleared to 0, then that critical warning does not apply. Critical warnings may result in an asynchronous event notification to the host. Bits in this field represent the current associated state and are not persistent.",
},
@ -37,7 +37,7 @@ var NmveMetadata = map[string]NvmeAttributeMetadata{
ID: "available_spare",
DisplayName: "Available Spare",
DisplayType: "",
Ideal: "",
Ideal: "high",
Critical: true,
Description: "Contains a normalized percentage (0 to 100%) of the remaining spare capacity available.",
},
@ -45,7 +45,7 @@ var NmveMetadata = map[string]NvmeAttributeMetadata{
ID: "percentage_used",
DisplayName: "Percentage Used",
DisplayType: "",
Ideal: "",
Ideal: "low",
Critical: true,
Description: "Contains a vendor specific estimate of the percentage of NVM subsystem life used based on the actual usage and the manufacturers prediction of NVM life. A value of 100 indicates that the estimated endurance of the NVM in the NVM subsystem has been consumed, but may not indicate an NVM subsystem failure. The value is allowed to exceed 100. Percentages greater than 254 shall be represented as 255. This value shall be updated once per power-on hour (when the controller is not in a sleep state).",
},
@ -117,7 +117,7 @@ var NmveMetadata = map[string]NvmeAttributeMetadata{
ID: "media_errors",
DisplayName: "Media Errors",
DisplayType: "",
Ideal: "",
Ideal: "low",
Critical: true,
Description: "Contains the number of occurrences where the controller detected an unrecovered data integrity error. Errors such as uncorrectable ECC, CRC checksum failure, or LBA tag mismatch are included in this field.",
},
@ -125,7 +125,7 @@ var NmveMetadata = map[string]NvmeAttributeMetadata{
ID: "num_err_log_entries",
DisplayName: "Numb Err Log Entries",
DisplayType: "",
Ideal: "",
Ideal: "low",
Critical: true,
Description: "Contains the number of Error Information log entries over the life of the controller.",
},

@ -17,7 +17,7 @@ var ScsiMetadata = map[string]ScsiAttributeMetadata{
ID: "scsi_grown_defect_list",
DisplayName: "Grown Defect List",
DisplayType: "",
Ideal: "",
Ideal: "low",
Critical: true,
Description: "",
},
@ -41,7 +41,7 @@ var ScsiMetadata = map[string]ScsiAttributeMetadata{
ID: "read.errors_corrected_by_rereads_rewrites",
DisplayName: "Read Errors Corrected by ReReads/ReWrites",
DisplayType: "",
Ideal: "",
Ideal: "low",
Critical: true,
Description: "",
},
@ -65,7 +65,7 @@ var ScsiMetadata = map[string]ScsiAttributeMetadata{
ID: "read.total_uncorrected_errors",
DisplayName: "Read Total Uncorrected Errors",
DisplayType: "",
Ideal: "",
Ideal: "low",
Critical: true,
Description: "",
},
@ -89,7 +89,7 @@ var ScsiMetadata = map[string]ScsiAttributeMetadata{
ID: "write.errors_corrected_by_rereads_rewrites",
DisplayName: "Write Errors Corrected by ReReads/ReWrites",
DisplayType: "",
Ideal: "",
Ideal: "low",
Critical: true,
Description: "",
},
@ -113,7 +113,7 @@ var ScsiMetadata = map[string]ScsiAttributeMetadata{
ID: "write.total_uncorrected_errors",
DisplayName: "Write Total Uncorrected Errors",
DisplayType: "",
Ideal: "",
Ideal: "low",
Critical: true,
Description: "",
},

@ -1,9 +1,7 @@
package db
import (
"github.com/analogj/scrutiny/webapp/backend/pkg/metadata"
"github.com/analogj/scrutiny/webapp/backend/pkg/models/collector"
"strings"
"time"
)
@ -130,33 +128,24 @@ func (dv *Device) SquashHistory() error {
}
func (dv *Device) ApplyMetadataRules() error {
if !dv.IsAta() {
// Scrutiny Observed thresholds not yet available for NVME or SCSI drives
// since most SMART attributes are not present and BackBlaze data not available
return nil
}
//embed metadata in the latest smart attributes object
if len(dv.SmartResults) > 0 && len(dv.SmartResults[0].AtaAttributes) > 0 {
if len(dv.SmartResults) > 0 {
for ndx, attr := range dv.SmartResults[0].AtaAttributes {
if strings.ToUpper(attr.WhenFailed) == SmartWhenFailedFailingNow {
//this attribute has previously failed
dv.SmartResults[0].AtaAttributes[ndx].Status = SmartAttributeStatusFailed
dv.SmartResults[0].AtaAttributes[ndx].StatusReason = "Attribute is failing manufacturer SMART threshold"
} else if strings.ToUpper(attr.WhenFailed) == SmartWhenFailedInThePast {
dv.SmartResults[0].AtaAttributes[ndx].Status = SmartAttributeStatusWarning
dv.SmartResults[0].AtaAttributes[ndx].StatusReason = "Attribute has previously failed manufacturer SMART threshold"
attr.PopulateAttributeStatus()
dv.SmartResults[0].AtaAttributes[ndx] = attr
}
if smartMetadata, ok := metadata.AtaMetadata[attr.AttributeId]; ok {
dv.SmartResults[0].AtaAttributes[ndx].MetadataObservedThresholdStatus(smartMetadata)
}
for ndx, attr := range dv.SmartResults[0].NvmeAttributes {
attr.PopulateAttributeStatus()
dv.SmartResults[0].NvmeAttributes[ndx] = attr
//check if status is blank, set to "passed"
if len(dv.SmartResults[0].AtaAttributes[ndx].Status) == 0 {
dv.SmartResults[0].AtaAttributes[ndx].Status = SmartAttributeStatusPassed
}
for ndx, attr := range dv.SmartResults[0].ScsiAttributes {
attr.PopulateAttributeStatus()
dv.SmartResults[0].ScsiAttributes[ndx] = attr
}
}
return nil

@ -29,6 +29,7 @@ type Smart struct {
ScsiAttributes []SmartScsiAttribute `json:"scsi_attributes" gorm:"foreignkey:SmartId"`
}
//Parse Collector SMART data results and create Smart object (and associated SmartAtaAttribute entries)
func (sm *Smart) FromCollectorSmartInfo(wwn string, info collector.SmartInfo) error {
sm.DeviceWWN = wwn
sm.TestDate = time.Unix(info.LocalTime.TimeT, 0)
@ -55,6 +56,7 @@ func (sm *Smart) FromCollectorSmartInfo(wwn string, info collector.SmartInfo) er
return nil
}
//generate SmartAtaAttribute entries from Scrutiny Collector Smart data.
func (sm *Smart) ProcessAtaSmartInfo(info collector.SmartInfo) {
sm.AtaAttributes = []SmartAtaAttribute{}
for _, collectorAttr := range info.AtaSmartAttributes.Table {
@ -80,41 +82,43 @@ func (sm *Smart) ProcessAtaSmartInfo(info collector.SmartInfo) {
}
}
//generate SmartNvmeAttribute entries from Scrutiny Collector Smart data.
func (sm *Smart) ProcessNvmeSmartInfo(info collector.SmartInfo) {
sm.NvmeAttributes = []SmartNvmeAttribute{
{AttributeId: "critical_warning", Name: "Critical Warning", Value: info.NvmeSmartHealthInformationLog.CriticalWarning},
{AttributeId: "temperature", Name: "Temperature", Value: info.NvmeSmartHealthInformationLog.Temperature},
{AttributeId: "critical_warning", Name: "Critical Warning", Value: info.NvmeSmartHealthInformationLog.CriticalWarning, Threshold: 0},
{AttributeId: "temperature", Name: "Temperature", Value: info.NvmeSmartHealthInformationLog.Temperature, Threshold: -1},
{AttributeId: "available_spare", Name: "Available Spare", Value: info.NvmeSmartHealthInformationLog.AvailableSpare, Threshold: info.NvmeSmartHealthInformationLog.AvailableSpareThreshold},
{AttributeId: "percentage_used", Name: "Percentage Used", Value: info.NvmeSmartHealthInformationLog.PercentageUsed, Threshold: 100},
{AttributeId: "data_units_read", Name: "Data Units Read", Value: info.NvmeSmartHealthInformationLog.DataUnitsRead},
{AttributeId: "data_units_written", Name: "Data Units Written", Value: info.NvmeSmartHealthInformationLog.DataUnitsWritten},
{AttributeId: "host_reads", Name: "Host Reads", Value: info.NvmeSmartHealthInformationLog.HostReads},
{AttributeId: "host_writes", Name: "Host Writes", Value: info.NvmeSmartHealthInformationLog.HostWrites},
{AttributeId: "controller_busy_time", Name: "Controller Busy Time", Value: info.NvmeSmartHealthInformationLog.ControllerBusyTime},
{AttributeId: "power_cycles", Name: "Power Cycles", Value: info.NvmeSmartHealthInformationLog.PowerCycles},
{AttributeId: "power_on_hours", Name: "Power on Hours", Value: info.NvmeSmartHealthInformationLog.PowerOnHours},
{AttributeId: "unsafe_shutdowns", Name: "Unsafe Shutdowns", Value: info.NvmeSmartHealthInformationLog.UnsafeShutdowns},
{AttributeId: "media_errors", Name: "Media Errors", Value: info.NvmeSmartHealthInformationLog.MediaErrors},
{AttributeId: "num_err_log_entries", Name: "Numb Err Log Entries", Value: info.NvmeSmartHealthInformationLog.NumErrLogEntries},
{AttributeId: "warning_temp_time", Name: "Warning Temp Time", Value: info.NvmeSmartHealthInformationLog.WarningTempTime},
{AttributeId: "critical_comp_time", Name: "Critical CompTime", Value: info.NvmeSmartHealthInformationLog.CriticalCompTime},
{AttributeId: "data_units_read", Name: "Data Units Read", Value: info.NvmeSmartHealthInformationLog.DataUnitsRead, Threshold: -1},
{AttributeId: "data_units_written", Name: "Data Units Written", Value: info.NvmeSmartHealthInformationLog.DataUnitsWritten, Threshold: -1},
{AttributeId: "host_reads", Name: "Host Reads", Value: info.NvmeSmartHealthInformationLog.HostReads, Threshold: -1},
{AttributeId: "host_writes", Name: "Host Writes", Value: info.NvmeSmartHealthInformationLog.HostWrites, Threshold: -1},
{AttributeId: "controller_busy_time", Name: "Controller Busy Time", Value: info.NvmeSmartHealthInformationLog.ControllerBusyTime, Threshold: -1},
{AttributeId: "power_cycles", Name: "Power Cycles", Value: info.NvmeSmartHealthInformationLog.PowerCycles, Threshold: -1},
{AttributeId: "power_on_hours", Name: "Power on Hours", Value: info.NvmeSmartHealthInformationLog.PowerOnHours, Threshold: -1},
{AttributeId: "unsafe_shutdowns", Name: "Unsafe Shutdowns", Value: info.NvmeSmartHealthInformationLog.UnsafeShutdowns, Threshold: -1},
{AttributeId: "media_errors", Name: "Media Errors", Value: info.NvmeSmartHealthInformationLog.MediaErrors, Threshold: 0},
{AttributeId: "num_err_log_entries", Name: "Numb Err Log Entries", Value: info.NvmeSmartHealthInformationLog.NumErrLogEntries, Threshold: 0},
{AttributeId: "warning_temp_time", Name: "Warning Temp Time", Value: info.NvmeSmartHealthInformationLog.WarningTempTime, Threshold: -1},
{AttributeId: "critical_comp_time", Name: "Critical CompTime", Value: info.NvmeSmartHealthInformationLog.CriticalCompTime, Threshold: -1},
}
}
//generate SmartScsiAttribute entries from Scrutiny Collector Smart data.
func (sm *Smart) ProcessScsiSmartInfo(info collector.SmartInfo) {
sm.ScsiAttributes = []SmartScsiAttribute{
{AttributeId: "scsi_grown_defect_list", Name: "Grown Defect List", Value: info.ScsiGrownDefectList},
{AttributeId: "read.errors_corrected_by_eccfast", Name: "Read Errors Corrected by ECC Fast", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByEccfast},
{AttributeId: "read.errors_corrected_by_eccdelayed", Name: "Read Errors Corrected by ECC Delayed", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByEccdelayed},
{AttributeId: "read.errors_corrected_by_rereads_rewrites", Name: "Read Errors Corrected by ReReads/ReWrites", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByRereadsRewrites},
{AttributeId: "read.total_errors_corrected", Name: "Read Total Errors Corrected", Value: info.ScsiErrorCounterLog.Read.TotalErrorsCorrected},
{AttributeId: "read.correction_algorithm_invocations", Name: "Read Correction Algorithm Invocations", Value: info.ScsiErrorCounterLog.Read.CorrectionAlgorithmInvocations},
{AttributeId: "read.total_uncorrected_errors", Name: "Read Total Uncorrected Errors", Value: info.ScsiErrorCounterLog.Read.TotalUncorrectedErrors},
{AttributeId: "write.errors_corrected_by_eccfast", Name: "Write Errors Corrected by ECC Fast", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByEccfast},
{AttributeId: "write.errors_corrected_by_eccdelayed", Name: "Write Errors Corrected by ECC Delayed", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByEccdelayed},
{AttributeId: "write.errors_corrected_by_rereads_rewrites", Name: "Write Errors Corrected by ReReads/ReWrites", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByRereadsRewrites},
{AttributeId: "write.total_errors_corrected", Name: "Write Total Errors Corrected", Value: info.ScsiErrorCounterLog.Write.TotalErrorsCorrected},
{AttributeId: "write.correction_algorithm_invocations", Name: "Write Correction Algorithm Invocations", Value: info.ScsiErrorCounterLog.Write.CorrectionAlgorithmInvocations},
{AttributeId: "write.total_uncorrected_errors", Name: "Write Total Uncorrected Errors", Value: info.ScsiErrorCounterLog.Write.TotalUncorrectedErrors},
{AttributeId: "scsi_grown_defect_list", Name: "Grown Defect List", Value: info.ScsiGrownDefectList, Threshold: 0},
{AttributeId: "read.errors_corrected_by_eccfast", Name: "Read Errors Corrected by ECC Fast", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByEccfast, Threshold: -1},
{AttributeId: "read.errors_corrected_by_eccdelayed", Name: "Read Errors Corrected by ECC Delayed", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByEccdelayed, Threshold: -1},
{AttributeId: "read.errors_corrected_by_rereads_rewrites", Name: "Read Errors Corrected by ReReads/ReWrites", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByRereadsRewrites, Threshold: 0},
{AttributeId: "read.total_errors_corrected", Name: "Read Total Errors Corrected", Value: info.ScsiErrorCounterLog.Read.TotalErrorsCorrected, Threshold: -1},
{AttributeId: "read.correction_algorithm_invocations", Name: "Read Correction Algorithm Invocations", Value: info.ScsiErrorCounterLog.Read.CorrectionAlgorithmInvocations, Threshold: -1},
{AttributeId: "read.total_uncorrected_errors", Name: "Read Total Uncorrected Errors", Value: info.ScsiErrorCounterLog.Read.TotalUncorrectedErrors, Threshold: 0},
{AttributeId: "write.errors_corrected_by_eccfast", Name: "Write Errors Corrected by ECC Fast", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByEccfast, Threshold: -1},
{AttributeId: "write.errors_corrected_by_eccdelayed", Name: "Write Errors Corrected by ECC Delayed", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByEccdelayed, Threshold: -1},
{AttributeId: "write.errors_corrected_by_rereads_rewrites", Name: "Write Errors Corrected by ReReads/ReWrites", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByRereadsRewrites, Threshold: 0},
{AttributeId: "write.total_errors_corrected", Name: "Write Total Errors Corrected", Value: info.ScsiErrorCounterLog.Write.TotalErrorsCorrected, Threshold: -1},
{AttributeId: "write.correction_algorithm_invocations", Name: "Write Correction Algorithm Invocations", Value: info.ScsiErrorCounterLog.Write.CorrectionAlgorithmInvocations, Threshold: -1},
{AttributeId: "write.total_uncorrected_errors", Name: "Write Total Uncorrected Errors", Value: info.ScsiErrorCounterLog.Write.TotalUncorrectedErrors, Threshold: 0},
}
}

@ -3,6 +3,7 @@ package db
import (
"github.com/analogj/scrutiny/webapp/backend/pkg/metadata"
"github.com/jinzhu/gorm"
"strings"
)
const SmartAttributeStatusPassed = "passed"
@ -31,6 +32,28 @@ type SmartAtaAttribute struct {
History []SmartAtaAttribute `gorm:"-" json:"history,omitempty"`
}
// PopulateAttributeStatus sets Status (and StatusReason, where applicable) on
// the attribute. The WhenFailed flag is evaluated first, then the observed
// threshold metadata registered for this attribute ID, if any. An attribute
// whose Status is still empty afterwards is marked as passed.
func (sa *SmartAtaAttribute) PopulateAttributeStatus() {
	// WhenFailed is upper-cased before being compared with the known values.
	switch strings.ToUpper(sa.WhenFailed) {
	case SmartWhenFailedFailingNow:
		// the attribute is currently below the manufacturer threshold
		sa.Status = SmartAttributeStatusFailed
		sa.StatusReason = "Attribute is failing manufacturer SMART threshold"
	case SmartWhenFailedInThePast:
		// the attribute recovered, but has crossed the threshold before
		sa.Status = SmartAttributeStatusWarning
		sa.StatusReason = "Attribute has previously failed manufacturer SMART threshold"
	}

	// Let the observed-threshold rules update the status when metadata exists
	// for this attribute ID.
	if observed, found := metadata.AtaMetadata[sa.AttributeId]; found {
		sa.MetadataObservedThresholdStatus(observed)
	}

	// Nothing flagged the attribute: default to "passed".
	if sa.Status == "" {
		sa.Status = SmartAttributeStatusPassed
	}
}
// compare the attribute (raw, normalized, transformed) value to observed thresholds, and update status if necessary
func (sa *SmartAtaAttribute) MetadataObservedThresholdStatus(smartMetadata metadata.AtaAttributeMetadata) {
//TODO: multiple rules

@ -1,6 +1,9 @@
package db
import "github.com/jinzhu/gorm"
import (
"github.com/analogj/scrutiny/webapp/backend/pkg/metadata"
"github.com/jinzhu/gorm"
)
type SmartNvmeAttribute struct {
gorm.Model
@ -19,3 +22,25 @@ type SmartNvmeAttribute struct {
FailureRate float64 `gorm:"-" json:"failure_rate,omitempty"`
History []SmartNvmeAttribute `gorm:"-" json:"history,omitempty"`
}
// PopulateAttributeStatus sets Status (and StatusReason on failure) by
// comparing Value against Threshold, in the direction given by the Ideal
// field of the attribute's entry in metadata.NmveMetadata.
//
// A Threshold of -1 is a sentinel meaning "no threshold" and skips the
// comparison, as does an attribute ID without metadata. An attribute whose
// Status is still empty afterwards is marked as passed.
func (sa *SmartNvmeAttribute) PopulateAttributeStatus() {
	if attrMeta, known := metadata.NmveMetadata[sa.AttributeId]; known && sa.Threshold != -1 {
		// Ideal decides which side of the threshold counts as a failure.
		thresholdCrossed := false
		switch attrMeta.Ideal {
		case "low":
			thresholdCrossed = sa.Value > sa.Threshold
		case "high":
			thresholdCrossed = sa.Value < sa.Threshold
		}

		if thresholdCrossed {
			sa.Status = SmartAttributeStatusFailed
			sa.StatusReason = "Attribute is failing recommended SMART threshold"
		}
	}
	//TODO: eventually figure out the critical_warning bits and determine correct error messages here.

	// Nothing flagged the attribute: default to "passed".
	if sa.Status == "" {
		sa.Status = SmartAttributeStatusPassed
	}
}

@ -1,6 +1,9 @@
package db
import "github.com/jinzhu/gorm"
import (
"github.com/analogj/scrutiny/webapp/backend/pkg/metadata"
"github.com/jinzhu/gorm"
)
type SmartScsiAttribute struct {
gorm.Model
@ -19,3 +22,24 @@ type SmartScsiAttribute struct {
FailureRate float64 `gorm:"-" json:"failure_rate,omitempty"`
History []SmartScsiAttribute `gorm:"-" json:"history,omitempty"`
}
// PopulateAttributeStatus sets Status (and StatusReason on failure) by
// comparing Value against Threshold, in the direction given by the Ideal
// field of the attribute's entry in metadata.ScsiMetadata.
//
// A Threshold of -1 is a sentinel meaning "no threshold" and skips the
// comparison, as does an attribute ID without metadata. An attribute whose
// Status is still empty afterwards is marked as passed.
func (sa *SmartScsiAttribute) PopulateAttributeStatus() {
	//-1 is a special number meaning no threshold.
	if sa.Threshold != -1 {
		// SCSI attribute IDs are registered in ScsiMetadata; looking them up in
		// the NVMe table never matches, which would leave every SCSI attribute
		// marked as passed regardless of its value.
		if smartMetadata, ok := metadata.ScsiMetadata[sa.AttributeId]; ok {
			//check what the ideal is. Ideal tells us whether the recorded value needs to be above or below the threshold
			if (smartMetadata.Ideal == "low" && sa.Value > sa.Threshold) ||
				(smartMetadata.Ideal == "high" && sa.Value < sa.Threshold) {
				sa.Status = SmartAttributeStatusFailed
				sa.StatusReason = "Attribute is failing recommended SMART threshold"
			}
		}
	}

	//check if status is blank, set to "passed"
	if len(sa.Status) == 0 {
		sa.Status = SmartAttributeStatusPassed
	}
}

@ -123,7 +123,11 @@ export class DetailComponent implements OnInit, AfterViewInit, OnDestroy {
}
// Returns the `ideal` value stored in the metadata entry for the given attribute.
// For ATA devices it is only returned when the entry's display type is "raw"
// (otherwise an empty string); for every other device type it is returned as-is.
getAttributeIdeal(attribute_data){
    const isAtaDevice = this.isAta()
    const attributeMetadata = this.data.metadata[attribute_data.attribute_id]

    if(!isAtaDevice){
        return attributeMetadata?.ideal
    }
    return attributeMetadata?.display_type == "raw" ? attributeMetadata?.ideal : ''
}
getAttributeWorst(attribute_data){
@ -131,6 +135,7 @@ export class DetailComponent implements OnInit, AfterViewInit, OnDestroy {
}
getAttributeThreshold(attribute_data){
if(this.isAta()){
if (this.data.metadata[attribute_data.attribute_id]?.display_type == "normalized"){
return attribute_data.thresh
} else {
@ -141,6 +146,9 @@ export class DetailComponent implements OnInit, AfterViewInit, OnDestroy {
// return ''
return attribute_data.thresh
}
} else {
return (attribute_data.thresh == -1 ? '' : attribute_data.thresh )
}
}
getAttributeCritical(attribute_data){
@ -178,10 +186,10 @@ export class DetailComponent implements OnInit, AfterViewInit, OnDestroy {
var latest_smart_result = smart_results[0];
let attributes_list = []
if(this.isScsi()) {
this.smartAttributeTableColumns = ['name', 'value', 'history'];
this.smartAttributeTableColumns = ['status', 'name', 'value', 'thresh', 'history'];
attributes_list = latest_smart_result.scsi_attributes
} else if(this.isNvme()){
this.smartAttributeTableColumns = ['name', 'value', 'history'];
this.smartAttributeTableColumns = ['status', 'name', 'value', 'thresh', 'ideal', 'history'];
attributes_list = latest_smart_result.nvme_attributes
} else {
//ATA

Loading…
Cancel
Save