Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Hwmon client for disk and solid state drives with temperature sensors
0004  * Copyright (C) 2019 Zodiac Inflight Innovations
0005  *
0006  * With input from:
0007  *    Hwmon client for S.M.A.R.T. hard disk drives with temperature sensors.
0008  *    (C) 2018 Linus Walleij
0009  *
0010  *    hwmon: Driver for SCSI/ATA temperature sensors
0011  *    by Constantin Baranov <const@mimas.ru>, submitted September 2009
0012  *
0013  * This driver supports reporting the temperature of SATA drives. It can be
0014  * easily extended to report the temperature of SCSI drives.
0015  *
0016  * The primary means to read drive temperatures and temperature limits
0017  * for ATA drives is the SCT Command Transport feature set as specified in
0018  * ATA8-ACS.
0019  * It can be used to read the current drive temperature, temperature limits,
0020  * and historic minimum and maximum temperatures. The SCT Command Transport
0021  * feature set is documented in "AT Attachment 8 - ATA/ATAPI Command Set
0022  * (ATA8-ACS)".
0023  *
0024  * If the SCT Command Transport feature set is not available, drive temperatures
0025  * may be readable through SMART attributes. Since SMART attributes are not well
0026  * defined, this method is only used as fallback mechanism.
0027  *
0028  * There are three SMART attributes which may report drive temperatures.
0029  * Those are defined as follows (from
0030  * http://www.cropel.com/library/smart-attribute-list.aspx).
0031  *
0032  * 190  Temperature Temperature, monitored by a sensor somewhere inside
0033  *          the drive. Raw value typically holds the actual
0034  *          temperature (hexadecimal) in its rightmost two digits.
0035  *
0036  * 194  Temperature Temperature, monitored by a sensor somewhere inside
0037  *          the drive. Raw value typically holds the actual
0038  *          temperature (hexadecimal) in its rightmost two digits.
0039  *
0040  * 231  Temperature Temperature, monitored by a sensor somewhere inside
0041  *          the drive. Raw value typically holds the actual
0042  *          temperature (hexadecimal) in its rightmost two digits.
0043  *
0044  * Wikipedia defines attributes a bit differently.
0045  *
0046  * 190  Temperature Value is equal to (100-temp. °C), allowing manufacturer
0047  *  Difference or   to set a minimum threshold which corresponds to a
0048  *  Airflow     maximum temperature. This also follows the convention of
0049  *  Temperature 100 being a best-case value and lower values being
0050  *          undesirable. However, some older drives may instead
0051  *          report raw Temperature (identical to 0xC2) or
0052  *          Temperature minus 50 here.
0053  * 194  Temperature or  Indicates the device temperature, if the appropriate
0054  *  Temperature sensor is fitted. Lowest byte of the raw value contains
0055  *  Celsius     the exact temperature value (Celsius degrees).
0056  * 231  Life Left   Indicates the approximate SSD life left, in terms of
0057  *  (SSDs) or   program/erase cycles or available reserved blocks.
0058  *  Temperature A normalized value of 100 represents a new drive, with
0059  *          a threshold value at 10 indicating a need for
0060  *          replacement. A value of 0 may mean that the drive is
0061  *          operating in read-only mode to allow data recovery.
0062  *          Previously (pre-2010) occasionally used for Drive
0063  *          Temperature (more typically reported at 0xC2).
0064  *
0065  * Common denominator is that the first raw byte reports the temperature
0066  * in degrees C on almost all drives. Some drives may report a fractional
0067  * temperature in the second raw byte.
0068  *
0069  * Known exceptions (from libatasmart):
0070  * - SAMSUNG SV0412H and SAMSUNG SV1204H report the temperature in 10th
0071  *   degrees C in the first two raw bytes.
0072  * - A few Maxtor drives report an unknown or bad value in attribute 194.
0073  * - Certain Apple SSD drives report an unknown value in attribute 190.
0074  *   Only certain firmware versions are affected.
0075  *
0076  * Those exceptions affect older ATA drives and are currently ignored.
0077  * Also, the second raw byte (possibly reporting the fractional temperature)
0078  * is currently ignored.
0079  *
0080  * Many drives also report temperature limits in additional SMART data raw
0081  * bytes. The format of those is not well defined and varies widely.
0082  * The driver does not currently attempt to report those limits.
0083  *
0084  * According to data in smartmontools, attribute 231 is rarely used to report
0085  * drive temperatures. At the same time, several drives report SSD life left
0086  * in attribute 231, but do not support temperature sensors. For this reason,
0087  * attribute 231 is currently ignored.
0088  *
0089  * Following above definitions, temperatures are reported as follows.
0090  * - If SCT Command Transport is supported, it is used to read the
0091  *   temperature and, if available, temperature limits.
0092  * - Otherwise, if SMART attribute 194 is supported, it is used to read
0093  *   the temperature.
0094  * - Otherwise, if SMART attribute 190 is supported, it is used to read
0095  *   the temperature.
0096  */
0097 
0098 #include <linux/ata.h>
0099 #include <linux/bits.h>
0100 #include <linux/device.h>
0101 #include <linux/hwmon.h>
0102 #include <linux/kernel.h>
0103 #include <linux/list.h>
0104 #include <linux/module.h>
0105 #include <linux/mutex.h>
0106 #include <scsi/scsi_cmnd.h>
0107 #include <scsi/scsi_device.h>
0108 #include <scsi/scsi_driver.h>
0109 #include <scsi/scsi_proto.h>
0110 
/*
 * Per-drive driver state. One instance is allocated in drivetemp_add() for
 * each device and freed in drivetemp_remove() (or when the add fails).
 */
struct drivetemp_data {
    struct list_head list;      /* entry in drivetemp_devlist */
    struct mutex lock;      /* serializes get_temp() calls using smartdata */
    struct scsi_device *sdev;   /* SCSI device the commands are sent to */
    struct device *dev;     /* instantiating device; key used on removal */
    struct device *hwdev;       /* hardware monitoring device */
    u8 smartdata[ATA_SECT_SIZE];    /* one-sector buffer for command data */
    /*
     * Temperature read method, chosen in drivetemp_identify_sata():
     * drivetemp_get_scttemp() or drivetemp_get_smarttemp().
     */
    int (*get_temp)(struct drivetemp_data *st, u32 attr, long *val);
    bool have_temp_lowest;      /* lowest temp in SCT status */
    bool have_temp_highest;     /* highest temp in SCT status */
    bool have_temp_min;     /* have min temp */
    bool have_temp_max;     /* have max temp */
    bool have_temp_lcrit;       /* have lower critical limit */
    bool have_temp_crit;        /* have critical limit */
    /* Limits below hold values already converted by temp_from_sct(). */
    int temp_min;           /* min temp */
    int temp_max;           /* max temp */
    int temp_lcrit;         /* lower critical limit */
    int temp_crit;          /* critical limit */
};
0130 
/* All successfully registered drives; searched by drivetemp_remove(). */
static LIST_HEAD(drivetemp_devlist);

/* Number of 12-byte attribute entries scanned in the SMART value table. */
#define ATA_MAX_SMART_ATTRS 30
/* SMART attribute ids that are evaluated as temperature sources. */
#define SMART_TEMP_PROP_190 190
#define SMART_TEMP_PROP_194 194

/*
 * Log address used to read the SCT status and to write SCT commands,
 * followed by the byte offsets evaluated in the returned status data.
 */
#define SCT_STATUS_REQ_ADDR 0xe0
#define  SCT_STATUS_VERSION_LOW     0   /* log byte offsets */
#define  SCT_STATUS_VERSION_HIGH    1
#define  SCT_STATUS_TEMP        200
#define  SCT_STATUS_TEMP_LOWEST     201
#define  SCT_STATUS_TEMP_HIGHEST    202
/* Log address from which the requested SCT data table is read back. */
#define SCT_READ_LOG_ADDR   0xe1
/* Values passed as the "feature" argument of drivetemp_ata_command(). */
#define  SMART_READ_LOG         0xd5
#define  SMART_WRITE_LOG        0xd6

/* Raw temperature byte that marks "no valid temperature". */
#define INVALID_TEMP        0x80

/* True unless the raw temperature byte equals INVALID_TEMP. */
#define temp_is_valid(temp) ((temp) != INVALID_TEMP)
/* Interpret the raw byte as a signed 8-bit value and scale it by 1000. */
#define temp_from_sct(temp) (((s8)(temp)) * 1000)
0151 
0152 static inline bool ata_id_smart_supported(u16 *id)
0153 {
0154     return id[ATA_ID_COMMAND_SET_1] & BIT(0);
0155 }
0156 
0157 static inline bool ata_id_smart_enabled(u16 *id)
0158 {
0159     return id[ATA_ID_CFS_ENABLE_1] & BIT(0);
0160 }
0161 
0162 static int drivetemp_scsi_command(struct drivetemp_data *st,
0163                  u8 ata_command, u8 feature,
0164                  u8 lba_low, u8 lba_mid, u8 lba_high)
0165 {
0166     u8 scsi_cmd[MAX_COMMAND_SIZE];
0167     int data_dir;
0168 
0169     memset(scsi_cmd, 0, sizeof(scsi_cmd));
0170     scsi_cmd[0] = ATA_16;
0171     if (ata_command == ATA_CMD_SMART && feature == SMART_WRITE_LOG) {
0172         scsi_cmd[1] = (5 << 1); /* PIO Data-out */
0173         /*
0174          * No off.line or cc, write to dev, block count in sector count
0175          * field.
0176          */
0177         scsi_cmd[2] = 0x06;
0178         data_dir = DMA_TO_DEVICE;
0179     } else {
0180         scsi_cmd[1] = (4 << 1); /* PIO Data-in */
0181         /*
0182          * No off.line or cc, read from dev, block count in sector count
0183          * field.
0184          */
0185         scsi_cmd[2] = 0x0e;
0186         data_dir = DMA_FROM_DEVICE;
0187     }
0188     scsi_cmd[4] = feature;
0189     scsi_cmd[6] = 1;    /* 1 sector */
0190     scsi_cmd[8] = lba_low;
0191     scsi_cmd[10] = lba_mid;
0192     scsi_cmd[12] = lba_high;
0193     scsi_cmd[14] = ata_command;
0194 
0195     return scsi_execute_req(st->sdev, scsi_cmd, data_dir,
0196                 st->smartdata, ATA_SECT_SIZE, NULL, HZ, 5,
0197                 NULL);
0198 }
0199 
/*
 * Issue an ATA SMART command (ATA_CMD_SMART) with the given feature value.
 *
 * @select is passed as the LBA low byte; the LBA mid and high bytes are
 * fixed to ATA_SMART_LBAM_PASS and ATA_SMART_LBAH_PASS. Data is exchanged
 * through st->smartdata.
 *
 * Returns whatever drivetemp_scsi_command() returns (0 on success).
 */
static int drivetemp_ata_command(struct drivetemp_data *st, u8 feature,
                 u8 select)
{
    return drivetemp_scsi_command(st, ATA_CMD_SMART, feature, select,
                     ATA_SMART_LBAM_PASS, ATA_SMART_LBAH_PASS);
}
0206 
0207 static int drivetemp_get_smarttemp(struct drivetemp_data *st, u32 attr,
0208                   long *temp)
0209 {
0210     u8 *buf = st->smartdata;
0211     bool have_temp = false;
0212     u8 temp_raw;
0213     u8 csum;
0214     int err;
0215     int i;
0216 
0217     err = drivetemp_ata_command(st, ATA_SMART_READ_VALUES, 0);
0218     if (err)
0219         return err;
0220 
0221     /* Checksum the read value table */
0222     csum = 0;
0223     for (i = 0; i < ATA_SECT_SIZE; i++)
0224         csum += buf[i];
0225     if (csum) {
0226         dev_dbg(&st->sdev->sdev_gendev,
0227             "checksum error reading SMART values\n");
0228         return -EIO;
0229     }
0230 
0231     for (i = 0; i < ATA_MAX_SMART_ATTRS; i++) {
0232         u8 *attr = buf + i * 12;
0233         int id = attr[2];
0234 
0235         if (!id)
0236             continue;
0237 
0238         if (id == SMART_TEMP_PROP_190) {
0239             temp_raw = attr[7];
0240             have_temp = true;
0241         }
0242         if (id == SMART_TEMP_PROP_194) {
0243             temp_raw = attr[7];
0244             have_temp = true;
0245             break;
0246         }
0247     }
0248 
0249     if (have_temp) {
0250         *temp = temp_raw * 1000;
0251         return 0;
0252     }
0253 
0254     return -ENXIO;
0255 }
0256 
0257 static int drivetemp_get_scttemp(struct drivetemp_data *st, u32 attr, long *val)
0258 {
0259     u8 *buf = st->smartdata;
0260     int err;
0261 
0262     err = drivetemp_ata_command(st, SMART_READ_LOG, SCT_STATUS_REQ_ADDR);
0263     if (err)
0264         return err;
0265     switch (attr) {
0266     case hwmon_temp_input:
0267         if (!temp_is_valid(buf[SCT_STATUS_TEMP]))
0268             return -ENODATA;
0269         *val = temp_from_sct(buf[SCT_STATUS_TEMP]);
0270         break;
0271     case hwmon_temp_lowest:
0272         if (!temp_is_valid(buf[SCT_STATUS_TEMP_LOWEST]))
0273             return -ENODATA;
0274         *val = temp_from_sct(buf[SCT_STATUS_TEMP_LOWEST]);
0275         break;
0276     case hwmon_temp_highest:
0277         if (!temp_is_valid(buf[SCT_STATUS_TEMP_HIGHEST]))
0278             return -ENODATA;
0279         *val = temp_from_sct(buf[SCT_STATUS_TEMP_HIGHEST]);
0280         break;
0281     default:
0282         err = -EINVAL;
0283         break;
0284     }
0285     return err;
0286 }
0287 
/* Model name prefixes for which SCT must not be used; see drivetemp_sct_avoid(). */
static const char * const sct_avoid_models[] = {
/*
 * These drives will have WRITE FPDMA QUEUED command timeouts and sometimes just
 * freeze until power-cycled under heavy write loads when their temperature is
 * getting polled in SCT mode. The SMART mode seems to be fine, though.
 *
 * While only the 3 TB model (DT01ACA3) was actually caught exhibiting the
 * problem let's play safe here to avoid data corruption and ban the whole
 * DT01ACAx family.
 *
 * The models from this array are prefix-matched.
 */
    "TOSHIBA DT01ACA",
};
0302 
0303 static bool drivetemp_sct_avoid(struct drivetemp_data *st)
0304 {
0305     struct scsi_device *sdev = st->sdev;
0306     unsigned int ctr;
0307 
0308     if (!sdev->model)
0309         return false;
0310 
0311     /*
0312      * The "model" field contains just the raw SCSI INQUIRY response
0313      * "product identification" field, which has a width of 16 bytes.
0314      * This field is space-filled, but is NOT NULL-terminated.
0315      */
0316     for (ctr = 0; ctr < ARRAY_SIZE(sct_avoid_models); ctr++)
0317         if (!strncmp(sdev->model, sct_avoid_models[ctr],
0318                  strlen(sct_avoid_models[ctr])))
0319             return true;
0320 
0321     return false;
0322 }
0323 
0324 static int drivetemp_identify_sata(struct drivetemp_data *st)
0325 {
0326     struct scsi_device *sdev = st->sdev;
0327     u8 *buf = st->smartdata;
0328     struct scsi_vpd *vpd;
0329     bool is_ata, is_sata;
0330     bool have_sct_data_table;
0331     bool have_sct_temp;
0332     bool have_smart;
0333     bool have_sct;
0334     u16 *ata_id;
0335     u16 version;
0336     long temp;
0337     int err;
0338 
0339     /* SCSI-ATA Translation present? */
0340     rcu_read_lock();
0341     vpd = rcu_dereference(sdev->vpd_pg89);
0342 
0343     /*
0344      * Verify that ATA IDENTIFY DEVICE data is included in ATA Information
0345      * VPD and that the drive implements the SATA protocol.
0346      */
0347     if (!vpd || vpd->len < 572 || vpd->data[56] != ATA_CMD_ID_ATA ||
0348         vpd->data[36] != 0x34) {
0349         rcu_read_unlock();
0350         return -ENODEV;
0351     }
0352     ata_id = (u16 *)&vpd->data[60];
0353     is_ata = ata_id_is_ata(ata_id);
0354     is_sata = ata_id_is_sata(ata_id);
0355     have_sct = ata_id_sct_supported(ata_id);
0356     have_sct_data_table = ata_id_sct_data_tables(ata_id);
0357     have_smart = ata_id_smart_supported(ata_id) &&
0358                 ata_id_smart_enabled(ata_id);
0359 
0360     rcu_read_unlock();
0361 
0362     /* bail out if this is not a SATA device */
0363     if (!is_ata || !is_sata)
0364         return -ENODEV;
0365 
0366     if (have_sct && drivetemp_sct_avoid(st)) {
0367         dev_notice(&sdev->sdev_gendev,
0368                "will avoid using SCT for temperature monitoring\n");
0369         have_sct = false;
0370     }
0371 
0372     if (!have_sct)
0373         goto skip_sct;
0374 
0375     err = drivetemp_ata_command(st, SMART_READ_LOG, SCT_STATUS_REQ_ADDR);
0376     if (err)
0377         goto skip_sct;
0378 
0379     version = (buf[SCT_STATUS_VERSION_HIGH] << 8) |
0380           buf[SCT_STATUS_VERSION_LOW];
0381     if (version != 2 && version != 3)
0382         goto skip_sct;
0383 
0384     have_sct_temp = temp_is_valid(buf[SCT_STATUS_TEMP]);
0385     if (!have_sct_temp)
0386         goto skip_sct;
0387 
0388     st->have_temp_lowest = temp_is_valid(buf[SCT_STATUS_TEMP_LOWEST]);
0389     st->have_temp_highest = temp_is_valid(buf[SCT_STATUS_TEMP_HIGHEST]);
0390 
0391     if (!have_sct_data_table)
0392         goto skip_sct_data;
0393 
0394     /* Request and read temperature history table */
0395     memset(buf, '\0', sizeof(st->smartdata));
0396     buf[0] = 5; /* data table command */
0397     buf[2] = 1; /* read table */
0398     buf[4] = 2; /* temperature history table */
0399 
0400     err = drivetemp_ata_command(st, SMART_WRITE_LOG, SCT_STATUS_REQ_ADDR);
0401     if (err)
0402         goto skip_sct_data;
0403 
0404     err = drivetemp_ata_command(st, SMART_READ_LOG, SCT_READ_LOG_ADDR);
0405     if (err)
0406         goto skip_sct_data;
0407 
0408     /*
0409      * Temperature limits per AT Attachment 8 -
0410      * ATA/ATAPI Command Set (ATA8-ACS)
0411      */
0412     st->have_temp_max = temp_is_valid(buf[6]);
0413     st->have_temp_crit = temp_is_valid(buf[7]);
0414     st->have_temp_min = temp_is_valid(buf[8]);
0415     st->have_temp_lcrit = temp_is_valid(buf[9]);
0416 
0417     st->temp_max = temp_from_sct(buf[6]);
0418     st->temp_crit = temp_from_sct(buf[7]);
0419     st->temp_min = temp_from_sct(buf[8]);
0420     st->temp_lcrit = temp_from_sct(buf[9]);
0421 
0422 skip_sct_data:
0423     if (have_sct_temp) {
0424         st->get_temp = drivetemp_get_scttemp;
0425         return 0;
0426     }
0427 skip_sct:
0428     if (!have_smart)
0429         return -ENODEV;
0430     st->get_temp = drivetemp_get_smarttemp;
0431     return drivetemp_get_smarttemp(st, hwmon_temp_input, &temp);
0432 }
0433 
0434 static int drivetemp_identify(struct drivetemp_data *st)
0435 {
0436     struct scsi_device *sdev = st->sdev;
0437 
0438     /* Bail out immediately if there is no inquiry data */
0439     if (!sdev->inquiry || sdev->inquiry_len < 16)
0440         return -ENODEV;
0441 
0442     /* Disk device? */
0443     if (sdev->type != TYPE_DISK && sdev->type != TYPE_ZBC)
0444         return -ENODEV;
0445 
0446     return drivetemp_identify_sata(st);
0447 }
0448 
0449 static int drivetemp_read(struct device *dev, enum hwmon_sensor_types type,
0450              u32 attr, int channel, long *val)
0451 {
0452     struct drivetemp_data *st = dev_get_drvdata(dev);
0453     int err = 0;
0454 
0455     if (type != hwmon_temp)
0456         return -EINVAL;
0457 
0458     switch (attr) {
0459     case hwmon_temp_input:
0460     case hwmon_temp_lowest:
0461     case hwmon_temp_highest:
0462         mutex_lock(&st->lock);
0463         err = st->get_temp(st, attr, val);
0464         mutex_unlock(&st->lock);
0465         break;
0466     case hwmon_temp_lcrit:
0467         *val = st->temp_lcrit;
0468         break;
0469     case hwmon_temp_min:
0470         *val = st->temp_min;
0471         break;
0472     case hwmon_temp_max:
0473         *val = st->temp_max;
0474         break;
0475     case hwmon_temp_crit:
0476         *val = st->temp_crit;
0477         break;
0478     default:
0479         err = -EINVAL;
0480         break;
0481     }
0482     return err;
0483 }
0484 
0485 static umode_t drivetemp_is_visible(const void *data,
0486                    enum hwmon_sensor_types type,
0487                    u32 attr, int channel)
0488 {
0489     const struct drivetemp_data *st = data;
0490 
0491     switch (type) {
0492     case hwmon_temp:
0493         switch (attr) {
0494         case hwmon_temp_input:
0495             return 0444;
0496         case hwmon_temp_lowest:
0497             if (st->have_temp_lowest)
0498                 return 0444;
0499             break;
0500         case hwmon_temp_highest:
0501             if (st->have_temp_highest)
0502                 return 0444;
0503             break;
0504         case hwmon_temp_min:
0505             if (st->have_temp_min)
0506                 return 0444;
0507             break;
0508         case hwmon_temp_max:
0509             if (st->have_temp_max)
0510                 return 0444;
0511             break;
0512         case hwmon_temp_lcrit:
0513             if (st->have_temp_lcrit)
0514                 return 0444;
0515             break;
0516         case hwmon_temp_crit:
0517             if (st->have_temp_crit)
0518                 return 0444;
0519             break;
0520         default:
0521             break;
0522         }
0523         break;
0524     default:
0525         break;
0526     }
0527     return 0;
0528 }
0529 
/*
 * Channel description handed to the hwmon core through drivetemp_chip_info.
 * All temperature attributes the driver can provide are listed here;
 * drivetemp_is_visible() decides per drive which of them are exposed.
 */
static const struct hwmon_channel_info *drivetemp_info[] = {
    HWMON_CHANNEL_INFO(chip,
               HWMON_C_REGISTER_TZ),
    HWMON_CHANNEL_INFO(temp, HWMON_T_INPUT |
               HWMON_T_LOWEST | HWMON_T_HIGHEST |
               HWMON_T_MIN | HWMON_T_MAX |
               HWMON_T_LCRIT | HWMON_T_CRIT),
    NULL
};
0539 
/* hwmon callbacks; the driver only supports reading, so no write hook is set. */
static const struct hwmon_ops drivetemp_ops = {
    .is_visible = drivetemp_is_visible,
    .read = drivetemp_read,
};
0544 
/* Chip description passed to hwmon_device_register_with_info() in drivetemp_add(). */
static const struct hwmon_chip_info drivetemp_chip_info = {
    .ops = &drivetemp_ops,
    .info = drivetemp_info,
};
0549 
0550 /*
0551  * The device argument points to sdev->sdev_dev. Its parent is
0552  * sdev->sdev_gendev, which we can use to get the scsi_device pointer.
0553  */
0554 static int drivetemp_add(struct device *dev, struct class_interface *intf)
0555 {
0556     struct scsi_device *sdev = to_scsi_device(dev->parent);
0557     struct drivetemp_data *st;
0558     int err;
0559 
0560     st = kzalloc(sizeof(*st), GFP_KERNEL);
0561     if (!st)
0562         return -ENOMEM;
0563 
0564     st->sdev = sdev;
0565     st->dev = dev;
0566     mutex_init(&st->lock);
0567 
0568     if (drivetemp_identify(st)) {
0569         err = -ENODEV;
0570         goto abort;
0571     }
0572 
0573     st->hwdev = hwmon_device_register_with_info(dev->parent, "drivetemp",
0574                             st, &drivetemp_chip_info,
0575                             NULL);
0576     if (IS_ERR(st->hwdev)) {
0577         err = PTR_ERR(st->hwdev);
0578         goto abort;
0579     }
0580 
0581     list_add(&st->list, &drivetemp_devlist);
0582     return 0;
0583 
0584 abort:
0585     kfree(st);
0586     return err;
0587 }
0588 
0589 static void drivetemp_remove(struct device *dev, struct class_interface *intf)
0590 {
0591     struct drivetemp_data *st, *tmp;
0592 
0593     list_for_each_entry_safe(st, tmp, &drivetemp_devlist, list) {
0594         if (st->dev == dev) {
0595             list_del(&st->list);
0596             hwmon_device_unregister(st->hwdev);
0597             kfree(st);
0598             break;
0599         }
0600     }
0601 }
0602 
/*
 * Class interface whose callbacks are invoked with the device being added
 * or removed; registered in drivetemp_init() via scsi_register_interface().
 */
static struct class_interface drivetemp_interface = {
    .add_dev = drivetemp_add,
    .remove_dev = drivetemp_remove,
};
0607 
/* Module entry point: register the class interface; returns the registration result. */
static int __init drivetemp_init(void)
{
    return scsi_register_interface(&drivetemp_interface);
}
0612 
/* Module exit point: unregister the class interface registered by drivetemp_init(). */
static void __exit drivetemp_exit(void)
{
    scsi_unregister_interface(&drivetemp_interface);
}
0617 
0618 module_init(drivetemp_init);
0619 module_exit(drivetemp_exit);
0620 
0621 MODULE_AUTHOR("Guenter Roeck <linus@roeck-us.net>");
0622 MODULE_DESCRIPTION("Hard drive temperature monitor");
0623 MODULE_LICENSE("GPL");
0624 MODULE_ALIAS("platform:drivetemp");