BUG 分析: 大量 D 进程卡在 shrink_inactive_list 导致 io 瓶颈( 三 )


然而,还是没彻底解决这个问题,所以我们把疑点再次指向 io 。
尝试抓取更多的信息,来了解触发瓶颈的微观过程 。
于是增加了 io 使用率、io 读写速度的监控,以 100ms 为时间片,监控连续处于 D 状态的进程,并收集 D 进程的堆栈信息、内存信息等。同时打开下面几个内核 trace 监控点,apk 一旦监控到持续 D 状态就触发 dump,再从 dump 中解析数据,用来观察微观过程 。
echo 1 > /sys/kernel/debug/tracing/events/writeback/enable
echo 1 > /sys/kernel/debug/tracing/events/vmscan/enable
echo 1 > /sys/kernel/debug/tracing/tracing_on
为了进一步深入分析上述微观过程,需要先补充一些相关代码和流程图:
收缩不活跃链表
/** shrink_inactive_list() is a helper for shrink_node().It returns the number* of reclaimed pages*/static noinline_for_stack unsigned longshrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,struct scan_control *sc, enum lru_list lru){LIST_HEAD(page_list);unsigned long nr_scanned;unsigned long nr_reclaimed = 0;unsigned long nr_taken;struct reclaim_stat stat = {};isolate_mode_t isolate_mode = 0;int file = is_file_lru(lru);struct pglist_data *pgdat = lruvec_pgdat(lruvec);struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;bool stalled = false;while (unlikely(too_many_isolated(pgdat, file, sc))) { //如果隔离的页太多就进入睡眠if (stalled)return 0;/* We are about to die and free our memory. Return now. */if (fatal_signal_pending(current))return SWAP_CLUSTER_MAX;/* wait a bit for the reclaimer. */msleep(100);stalled = true;}//将 lru 缓存中的页移到各个 lru 链表中去lru_add_drain();if (!sc->may_unmap)isolate_mode |= ISOLATE_UNMAPPED;spin_lock_irq(&pgdat->lru_lock);//隔离部分 lru 中的页,保存到临时链表 page_list 中nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,&nr_scanned, sc, isolate_mode, lru);//相关统计信息更新__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);reclaim_stat->recent_scanned[file] += nr_taken;if (current_is_kswapd()) {if (global_reclaim(sc))__count_vm_events(PGSCAN_KSWAPD, nr_scanned);count_memcg_events(lruvec_memcg(lruvec), PGSCAN_KSWAPD,nr_scanned);} else {if (global_reclaim(sc))__count_vm_events(PGSCAN_DIRECT, nr_scanned);count_memcg_events(lruvec_memcg(lruvec), PGSCAN_DIRECT,nr_scanned);}spin_unlock_irq(&pgdat->lru_lock);if (nr_taken == 0)return 0;//执行页面回收,待回收的页放在 page_list 中,回收完成之后没有被回收的页也被放在 page_list 中返回nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0,&stat, false);spin_lock_irq(&pgdat->lru_lock);if (current_is_kswapd()) {if (global_reclaim(sc))__count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed);count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_KSWAPD,nr_reclaimed);} else {if (global_reclaim(sc))__count_vm_events(PGSTEAL_DIRECT, 
nr_reclaimed);count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_DIRECT,nr_reclaimed);}//将没有回收的页放回对应链表中,如果页的引用计数为 0 就放到 page_list 中返回putback_inactive_pages(lruvec, &page_list);__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);spin_unlock_irq(&pgdat->lru_lock);mem_cgroup_uncharge_list(&page_list);//释放掉引用计数变为 0 的页free_unref_page_list(&page_list);/** If dirty pages are scanned that are not queued for IO, it* implies that flushers are not doing their job. This can* happen when memory pressure pushes dirty pages to the end of* the LRU before the dirty limits are breached and the dirty* data has expired. It can also happen when the proportion of* dirty pages grows not through writes but through memory* pressure reclaiming all the clean cache. And in some cases,* the flushers simply cannot keep up with the allocation* rate. Nudge the flusher threads in case they are asleep.*/if (stat.nr_unqueued_dirty == nr_taken)wakeup_flusher_threads(WB_REASON_VMSCAN);sc->nr.dirty += stat.nr_dirty;sc->nr.congested += stat.nr_congested;sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;sc->nr.writeback += stat.nr_writeback;sc->nr.immediate += stat.nr_immediate;sc->nr.taken += nr_taken;if (file)sc->nr.file_taken += nr_taken;trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,nr_scanned, nr_reclaimed, &stat, sc->priority, file);return nr_reclaimed;}