[PATCH] mm: microopt conditions

[cascardo/linux.git] / mm / page_alloc.c
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 104e69c..b0647b5 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -140,18 +140,13 @@ static void bad_page(const char *function, struct page *page)
                         1 << PG_reclaim |
                         1 << PG_slab    |
                         1 << PG_swapcache |
-                       1 << PG_writeback |
-                       1 << PG_reserved );
+                       1 << PG_writeback );
         set_page_count(page, 0);
         reset_page_mapcount(page);
         page->mapping = NULL;
         add_taint(TAINT_BAD_PAGE);
  }
  
-#ifndef CONFIG_HUGETLB_PAGE
-#define prep_compound_page(page, order) do { } while (0)
-#define destroy_compound_page(page, order) do { } while (0)
-#else
  /*
   * Higher-order pages are called "compound pages".  They are structured thusly:
   *
@@ -205,7 +200,6 @@ static void destroy_compound_page(struct page *page, unsigned long order)
                 ClearPageCompound(p);
         }
  }
-#endif         /* CONFIG_HUGETLB_PAGE */
  
  /*
   * function for dealing with page's order in buddy system.
@@ -340,11 +334,11 @@ static inline void __free_pages_bulk (struct page *page,
         zone->free_area[order].nr_free++;
  }
  
-static inline void free_pages_check(const char *function, struct page *page)
+static inline int free_pages_check(const char *function, struct page *page)
  {
-       if (    page_mapcount(page) ||
-               page->mapping != NULL ||
-               page_count(page) != 0 ||
+       if (unlikely(page_mapcount(page) |
+               (page->mapping != NULL)  |
+               (page_count(page) != 0)  |
                 (page->flags & (
                         1 << PG_lru     |
                         1 << PG_private |
@@ -354,10 +348,16 @@ static inline void free_pages_check(const char *function, struct page *page)
                         1 << PG_slab    |
                         1 << PG_swapcache |
                         1 << PG_writeback |
-                       1 << PG_reserved )))
+                       1 << PG_reserved ))))
                 bad_page(function, page);
         if (PageDirty(page))
                 __ClearPageDirty(page);
+       /*
+        * For now, we report if PG_reserved was found set, but do not
+        * clear it, and do not free the page.  But we shall soon need
+        * to do more, for when the ZERO_PAGE count wraps negative.
+        */
+       return PageReserved(page);
  }
  
  /*
@@ -375,11 +375,10 @@ static int
  free_pages_bulk(struct zone *zone, int count,
                 struct list_head *list, unsigned int order)
  {
-       unsigned long flags;
         struct page *page = NULL;
         int ret = 0;
  
-       spin_lock_irqsave(&zone->lock, flags);
+       spin_lock(&zone->lock);
         zone->all_unreclaimable = 0;
         zone->pages_scanned = 0;
         while (!list_empty(list) && count--) {
@@ -389,19 +388,19 @@ free_pages_bulk(struct zone *zone, int count,
                 __free_pages_bulk(page, zone, order);
                 ret++;
         }
-       spin_unlock_irqrestore(&zone->lock, flags);
+       spin_unlock(&zone->lock);
         return ret;
  }
  
  void __free_pages_ok(struct page *page, unsigned int order)
  {
+       unsigned long flags;
         LIST_HEAD(list);
         int i;
+       int reserved = 0;
  
         arch_free_page(page, order);
  
-       mod_page_state(pgfree, 1 << order);
-
  #ifndef CONFIG_MMU
         if (order > 0)
                 for (i = 1 ; i < (1 << order) ; ++i)
@@ -409,10 +408,16 @@ void __free_pages_ok(struct page *page, unsigned int order)
  #endif
  
         for (i = 0 ; i < (1 << order) ; ++i)
-               free_pages_check(__FUNCTION__, page + i);
+               reserved += free_pages_check(__FUNCTION__, page + i);
+       if (reserved)
+               return;
+
         list_add(&page->lru, &list);
+       mod_page_state(pgfree, 1 << order);
         kernel_map_pages(page, 1<<order, 0);
+       local_irq_save(flags);
         free_pages_bulk(page_zone(page), 1, &list, order);
+       local_irq_restore(flags);
  }
  
  
@@ -448,31 +453,14 @@ expand(struct zone *zone, struct page *page,
         return page;
  }
  
-void set_page_refs(struct page *page, int order)
-{
-#ifdef CONFIG_MMU
-       set_page_count(page, 1);
-#else
-       int i;
-
-       /*
-        * We need to reference all the pages for this order, otherwise if
-        * anyone accesses one of the pages with (get/put) it will be freed.
-        * - eg: access_process_vm()
-        */
-       for (i = 0; i < (1 << order); i++)
-               set_page_count(page + i, 1);
-#endif /* CONFIG_MMU */
-}
-
  /*
   * This page is about to be returned from the page allocator
   */
-static void prep_new_page(struct page *page, int order)
+static int prep_new_page(struct page *page, int order)
  {
-       if (    page_mapcount(page) ||
-               page->mapping != NULL ||
-               page_count(page) != 0 ||
+       if (unlikely(page_mapcount(page) |
+               (page->mapping != NULL)  |
+               (page_count(page) != 0)  |
                 (page->flags & (
                         1 << PG_lru     |
                         1 << PG_private |
@@ -483,15 +471,23 @@ static void prep_new_page(struct page *page, int order)
                         1 << PG_slab    |
                         1 << PG_swapcache |
                         1 << PG_writeback |
-                       1 << PG_reserved )))
+                       1 << PG_reserved ))))
                 bad_page(__FUNCTION__, page);
  
+       /*
+        * For now, we report if PG_reserved was found set, but do not
+        * clear it, and do not allocate the page: as a safety net.
+        */
+       if (PageReserved(page))
+               return 1;
+
         page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
                         1 << PG_referenced | 1 << PG_arch_1 |
                         1 << PG_checked | 1 << PG_mappedtodisk);
         set_page_private(page, 0);
         set_page_refs(page, order);
         kernel_map_pages(page, 1 << order, 1);
+       return 0;
  }
  
  /* 
@@ -528,12 +524,11 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
  static int rmqueue_bulk(struct zone *zone, unsigned int order, 
                         unsigned long count, struct list_head *list)
  {
-       unsigned long flags;
         int i;
         int allocated = 0;
         struct page *page;
         
-       spin_lock_irqsave(&zone->lock, flags);
+       spin_lock(&zone->lock);
         for (i = 0; i < count; ++i) {
                 page = __rmqueue(zone, order);
                 if (page == NULL)
@@ -541,7 +536,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
                 allocated++;
                 list_add_tail(&page->lru, list);
         }
-       spin_unlock_irqrestore(&zone->lock, flags);
+       spin_unlock(&zone->lock);
         return allocated;
  }
  
@@ -578,6 +573,7 @@ void drain_remote_pages(void)
  #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
  static void __drain_pages(unsigned int cpu)
  {
+       unsigned long flags;
         struct zone *zone;
         int i;
  
@@ -589,8 +585,10 @@ static void __drain_pages(unsigned int cpu)
                         struct per_cpu_pages *pcp;
  
                         pcp = &pset->pcp[i];
+                       local_irq_save(flags);
                         pcp->count -= free_pages_bulk(zone, pcp->count,
                                                 &pcp->list, 0);
+                       local_irq_restore(flags);
                 }
         }
  }
@@ -674,11 +672,14 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
  
         arch_free_page(page, 0);
  
-       kernel_map_pages(page, 1, 0);
-       inc_page_state(pgfree);
         if (PageAnon(page))
                 page->mapping = NULL;
-       free_pages_check(__FUNCTION__, page);
+       if (free_pages_check(__FUNCTION__, page))
+               return;
+
+       inc_page_state(pgfree);
+       kernel_map_pages(page, 1, 0);
+
         pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
         local_irq_save(flags);
         list_add(&page->lru, &pcp->list);
@@ -717,18 +718,20 @@ static struct page *
  buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
  {
         unsigned long flags;
-       struct page *page = NULL;
+       struct page *page;
         int cold = !!(gfp_flags & __GFP_COLD);
  
+again:
         if (order == 0) {
                 struct per_cpu_pages *pcp;
  
+               page = NULL;
                 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
                 local_irq_save(flags);
                 if (pcp->count <= pcp->low)
                         pcp->count += rmqueue_bulk(zone, 0,
                                                 pcp->batch, &pcp->list);
-               if (pcp->count) {
+               if (likely(pcp->count)) {
                         page = list_entry(pcp->list.next, struct page, lru);
                         list_del(&page->lru);
                         pcp->count--;
@@ -744,7 +747,8 @@ buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
         if (page != NULL) {
                 BUG_ON(bad_range(zone, page));
                 mod_page_state_zone(zone, pgalloc, 1 << order);
-               prep_new_page(page, order);
+               if (prep_new_page(page, order))
+                       goto again;
  
                 if (gfp_flags & __GFP_ZERO)
                         prep_zero_page(page, order, gfp_flags);
@@ -756,9 +760,12 @@ buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
  }
  
  #define ALLOC_NO_WATERMARKS    0x01 /* don't check watermarks at all */
-#define ALLOC_HARDER           0x02 /* try to alloc harder */
-#define ALLOC_HIGH             0x04 /* __GFP_HIGH set */
-#define ALLOC_CPUSET           0x08 /* check for correct cpuset */
+#define ALLOC_WMARK_MIN                0x02 /* use pages_min watermark */
+#define ALLOC_WMARK_LOW                0x04 /* use pages_low watermark */
+#define ALLOC_WMARK_HIGH       0x08 /* use pages_high watermark */
+#define ALLOC_HARDER           0x10 /* try to alloc harder */
+#define ALLOC_HIGH             0x20 /* __GFP_HIGH set */
+#define ALLOC_CPUSET           0x40 /* check for correct cpuset */
  
  /*
   * Return 1 if free pages are above 'mark'. This takes into account the order
@@ -813,7 +820,14 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
                         continue;
  
                 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
-                       if (!zone_watermark_ok(*z, order, (*z)->pages_low,
+                       unsigned long mark;
+                       if (alloc_flags & ALLOC_WMARK_MIN)
+                               mark = (*z)->pages_min;
+                       else if (alloc_flags & ALLOC_WMARK_LOW)
+                               mark = (*z)->pages_low;
+                       else
+                               mark = (*z)->pages_high;
+                       if (!zone_watermark_ok(*z, order, mark,
                                     classzone_idx, alloc_flags))
                                 continue;
                 }
@@ -845,21 +859,22 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
  
         might_sleep_if(wait);
  
+restart:
         z = zonelist->zones;  /* the list of zones suitable for gfp_mask */
  
         if (unlikely(*z == NULL)) {
                 /* Should this ever happen?? */
                 return NULL;
         }
-restart:
+
         page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
-                               zonelist, ALLOC_CPUSET);
+                               zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
         if (page)
                 goto got_pg;
  
-       do
+       do {
                 wakeup_kswapd(*z, order);
-       while (*(++z));
+       } while (*(++z));
  
         /*
          * OK, we're below the kswapd watermark and have kicked background
@@ -870,13 +885,12 @@ restart:
          * cannot run direct reclaim, or if the caller has realtime scheduling
          * policy.
          */
-       alloc_flags = 0;
+       alloc_flags = ALLOC_WMARK_MIN;
         if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
                 alloc_flags |= ALLOC_HARDER;
         if (gfp_mask & __GFP_HIGH)
                 alloc_flags |= ALLOC_HIGH;
-       if (wait)
-               alloc_flags |= ALLOC_CPUSET;
+       alloc_flags |= ALLOC_CPUSET;
  
         /*
          * Go through the zonelist again. Let __GFP_HIGH and allocations
@@ -898,7 +912,7 @@ restart:
  nofail_alloc:
                         /* go through the zonelist yet again, ignoring mins */
                         page = get_page_from_freelist(gfp_mask, order,
-                               zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET);
+                               zonelist, ALLOC_NO_WATERMARKS);
                         if (page)
                                 goto got_pg;
                         if (gfp_mask & __GFP_NOFAIL) {
@@ -941,7 +955,7 @@ rebalance:
                  * under heavy pressure.
                  */
                 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
-                                               zonelist, ALLOC_CPUSET);
+                               zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
                 if (page)
                         goto got_pg;
  
@@ -1681,8 +1695,6 @@ void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
         for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
                 if (!early_pfn_valid(pfn))
                         continue;
-               if (!early_pfn_in_nid(pfn, nid))
-                       continue;
                 page = pfn_to_page(pfn);
                 set_page_links(page, zone, nid, pfn);
                 set_page_count(page, 1);
@@ -1744,16 +1756,16 @@ static int __devinit zone_batchsize(struct zone *zone)
                 batch = 1;
  
         /*
-        * We will be trying to allcoate bigger chunks of contiguous
-        * memory of the order of fls(batch).  This should result in
-        * better cache coloring.
+        * Force the batch to a 2^n - 1 value. A power-of-2
+        * batch was found to be more likely to have
+        * suboptimal cache aliasing properties in some cases.
          *
-        * A sanity check also to ensure that batch is still in limits.
+        * For example if 2 tasks are alternately allocating
+        * batches of pages, one task can end up with a lot
+        * of pages of one half of the possible page colors
+        * and the other with pages of the other colors.
          */
-       batch = (1 << fls(batch + batch/2));
-
-       if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2))
-               batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2);
+       batch = (1 << (fls(batch + batch/2)-1)) - 1;
  
         return batch;
  }
@@ -1868,7 +1880,7 @@ static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
  static struct notifier_block pageset_notifier =
         { &pageset_cpuup_callback, NULL, 0 };
  
-void __init setup_per_cpu_pageset()
+void __init setup_per_cpu_pageset(void)
  {
         int err;