drm/i915/skl: Add WaDisableGafsUnitClkGating
[cascardo/linux.git] drivers/gpu/drm/i915/intel_ringbuffer.c
1 /*
2  * Copyright © 2008-2010 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Eric Anholt <eric@anholt.net>
25  *    Zou Nan hai <nanhai.zou@intel.com>
26  *    Xiang Hai hao <haihao.xiang@intel.com>
27  *
28  */
29
30 #include <linux/log2.h>
31 #include <drm/drmP.h>
32 #include "i915_drv.h"
33 #include <drm/i915_drm.h>
34 #include "i915_trace.h"
35 #include "intel_drv.h"
36
37 /* Rough estimate of the typical request size, performing a flush,
38  * set-context and then emitting the batch.
39  */
40 #define LEGACY_REQUEST_SIZE 200
41
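/*
 * Free space is the distance from tail back around to head in the circular
 * buffer, minus a small reserve (I915_RING_FREE_SPACE) so the tail can never
 * fully catch up with the head. Illustrative numbers: with size = 4096,
 * head = 512 and tail = 3584, space = 512 - 3584 + 4096 = 1024 bytes before
 * the reserve is subtracted.
 */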
42 int __intel_ring_space(int head, int tail, int size)
43 {
44         int space = head - tail;
45         if (space <= 0)
46                 space += size;
47         return space - I915_RING_FREE_SPACE;
48 }
49
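/*
 * If any requests have been retired since the last update, advance the
 * software head to the spot recorded for the most recently retired request
 * (last_retired_head == -1 means nothing new), then recompute the space.
 */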
50 void intel_ring_update_space(struct intel_ringbuffer *ringbuf)
51 {
52         if (ringbuf->last_retired_head != -1) {
53                 ringbuf->head = ringbuf->last_retired_head;
54                 ringbuf->last_retired_head = -1;
55         }
56
57         ringbuf->space = __intel_ring_space(ringbuf->head & HEAD_ADDR,
58                                             ringbuf->tail, ringbuf->size);
59 }
60
61 bool intel_engine_stopped(struct intel_engine_cs *engine)
62 {
63         struct drm_i915_private *dev_priv = engine->i915;
64         return dev_priv->gpu_error.stop_rings & intel_engine_flag(engine);
65 }
66
67 static void __intel_ring_advance(struct intel_engine_cs *engine)
68 {
69         struct intel_ringbuffer *ringbuf = engine->buffer;
70         ringbuf->tail &= ringbuf->size - 1;
71         if (intel_engine_stopped(engine))
72                 return;
73         engine->write_tail(engine, ringbuf->tail);
74 }
75
76 static int
77 gen2_render_ring_flush(struct drm_i915_gem_request *req,
78                        u32      invalidate_domains,
79                        u32      flush_domains)
80 {
81         struct intel_engine_cs *engine = req->engine;
82         u32 cmd;
83         int ret;
84
85         cmd = MI_FLUSH;
86         if (((invalidate_domains|flush_domains) & I915_GEM_DOMAIN_RENDER) == 0)
87                 cmd |= MI_NO_WRITE_FLUSH;
88
89         if (invalidate_domains & I915_GEM_DOMAIN_SAMPLER)
90                 cmd |= MI_READ_FLUSH;
91
92         ret = intel_ring_begin(req, 2);
93         if (ret)
94                 return ret;
95
96         intel_ring_emit(engine, cmd);
97         intel_ring_emit(engine, MI_NOOP);
98         intel_ring_advance(engine);
99
100         return 0;
101 }
102
103 static int
104 gen4_render_ring_flush(struct drm_i915_gem_request *req,
105                        u32      invalidate_domains,
106                        u32      flush_domains)
107 {
108         struct intel_engine_cs *engine = req->engine;
109         u32 cmd;
110         int ret;
111
112         /*
113          * read/write caches:
114          *
115          * I915_GEM_DOMAIN_RENDER is always invalidated, but is
116          * only flushed if MI_NO_WRITE_FLUSH is unset.  On 965, it is
117          * also flushed at 2d versus 3d pipeline switches.
118          *
119          * read-only caches:
120          *
121          * I915_GEM_DOMAIN_SAMPLER is flushed on pre-965 if
122          * MI_READ_FLUSH is set, and is always flushed on 965.
123          *
124          * I915_GEM_DOMAIN_COMMAND may not exist?
125          *
126          * I915_GEM_DOMAIN_INSTRUCTION, which exists on 965, is
127          * invalidated when MI_EXE_FLUSH is set.
128          *
129          * I915_GEM_DOMAIN_VERTEX, which exists on 965, is
130          * invalidated with every MI_FLUSH.
131          *
132          * TLBs:
133          *
134          * On 965, TLBs associated with I915_GEM_DOMAIN_COMMAND
135          * and I915_GEM_DOMAIN_CPU are invalidated at PTE write and
136          * I915_GEM_DOMAIN_RENDER and I915_GEM_DOMAIN_SAMPLER
137          * are flushed at any MI_FLUSH.
138          */
139
140         cmd = MI_FLUSH | MI_NO_WRITE_FLUSH;
141         if ((invalidate_domains|flush_domains) & I915_GEM_DOMAIN_RENDER)
142                 cmd &= ~MI_NO_WRITE_FLUSH;
143         if (invalidate_domains & I915_GEM_DOMAIN_INSTRUCTION)
144                 cmd |= MI_EXE_FLUSH;
145
146         if (invalidate_domains & I915_GEM_DOMAIN_COMMAND &&
147             (IS_G4X(req->i915) || IS_GEN5(req->i915)))
148                 cmd |= MI_INVALIDATE_ISP;
149
150         ret = intel_ring_begin(req, 2);
151         if (ret)
152                 return ret;
153
154         intel_ring_emit(engine, cmd);
155         intel_ring_emit(engine, MI_NOOP);
156         intel_ring_advance(engine);
157
158         return 0;
159 }
160
161 /**
162  * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
163  * implementing two workarounds on gen6.  From section 1.4.7.1
164  * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
165  *
166  * [DevSNB-C+{W/A}] Before any depth stall flush (including those
167  * produced by non-pipelined state commands), software needs to first
168  * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
169  * 0.
170  *
171  * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
172  * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
173  *
174  * And the workaround for these two requires this workaround first:
175  *
176  * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
177  * BEFORE the pipe-control with a post-sync op and no write-cache
178  * flushes.
179  *
180  * And this last workaround is tricky because of the requirements on
181  * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
182  * volume 2 part 1:
183  *
184  *     "1 of the following must also be set:
185  *      - Render Target Cache Flush Enable ([12] of DW1)
186  *      - Depth Cache Flush Enable ([0] of DW1)
187  *      - Stall at Pixel Scoreboard ([1] of DW1)
188  *      - Depth Stall ([13] of DW1)
189  *      - Post-Sync Operation ([13] of DW1)
190  *      - Notify Enable ([8] of DW1)"
191  *
192  * The cache flushes require the workaround flush that triggered this
193  * one, so we can't use it.  Depth stall would trigger the same.
194  * Post-sync nonzero is what triggered this second workaround, so we
195  * can't use that one either.  Notify enable is IRQs, which aren't
196  * really our business.  That leaves only stall at scoreboard.
197  */
198 static int
199 intel_emit_post_sync_nonzero_flush(struct drm_i915_gem_request *req)
200 {
201         struct intel_engine_cs *engine = req->engine;
202         u32 scratch_addr = engine->scratch.gtt_offset + 2 * CACHELINE_BYTES;
203         int ret;
204
205         ret = intel_ring_begin(req, 6);
206         if (ret)
207                 return ret;
208
209         intel_ring_emit(engine, GFX_OP_PIPE_CONTROL(5));
210         intel_ring_emit(engine, PIPE_CONTROL_CS_STALL |
211                         PIPE_CONTROL_STALL_AT_SCOREBOARD);
212         intel_ring_emit(engine, scratch_addr | PIPE_CONTROL_GLOBAL_GTT); /* address */
213         intel_ring_emit(engine, 0); /* low dword */
214         intel_ring_emit(engine, 0); /* high dword */
215         intel_ring_emit(engine, MI_NOOP);
216         intel_ring_advance(engine);
217
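        /* Now the post-sync non-zero write that the workarounds above require. */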
218         ret = intel_ring_begin(req, 6);
219         if (ret)
220                 return ret;
221
222         intel_ring_emit(engine, GFX_OP_PIPE_CONTROL(5));
223         intel_ring_emit(engine, PIPE_CONTROL_QW_WRITE);
224         intel_ring_emit(engine, scratch_addr | PIPE_CONTROL_GLOBAL_GTT); /* address */
225         intel_ring_emit(engine, 0);
226         intel_ring_emit(engine, 0);
227         intel_ring_emit(engine, MI_NOOP);
228         intel_ring_advance(engine);
229
230         return 0;
231 }
232
233 static int
234 gen6_render_ring_flush(struct drm_i915_gem_request *req,
235                        u32 invalidate_domains, u32 flush_domains)
236 {
237         struct intel_engine_cs *engine = req->engine;
238         u32 flags = 0;
239         u32 scratch_addr = engine->scratch.gtt_offset + 2 * CACHELINE_BYTES;
240         int ret;
241
242         /* Force SNB workarounds for PIPE_CONTROL flushes */
243         ret = intel_emit_post_sync_nonzero_flush(req);
244         if (ret)
245                 return ret;
246
247         /* Just flush everything.  Experiments have shown that reducing the
248          * number of bits based on the write domains has little performance
249          * impact.
250          */
251         if (flush_domains) {
252                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
253                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
254                 /*
255                  * Ensure that any following seqno writes only happen
256                  * when the render cache is indeed flushed.
257                  */
258                 flags |= PIPE_CONTROL_CS_STALL;
259         }
260         if (invalidate_domains) {
261                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
262                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
263                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
264                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
265                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
266                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
267                 /*
268                  * TLB invalidate requires a post-sync write.
269                  */
270                 flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
271         }
272
273         ret = intel_ring_begin(req, 4);
274         if (ret)
275                 return ret;
276
277         intel_ring_emit(engine, GFX_OP_PIPE_CONTROL(4));
278         intel_ring_emit(engine, flags);
279         intel_ring_emit(engine, scratch_addr | PIPE_CONTROL_GLOBAL_GTT);
280         intel_ring_emit(engine, 0);
281         intel_ring_advance(engine);
282
283         return 0;
284 }
285
286 static int
287 gen7_render_ring_cs_stall_wa(struct drm_i915_gem_request *req)
288 {
289         struct intel_engine_cs *engine = req->engine;
290         int ret;
291
292         ret = intel_ring_begin(req, 4);
293         if (ret)
294                 return ret;
295
296         intel_ring_emit(engine, GFX_OP_PIPE_CONTROL(4));
297         intel_ring_emit(engine, PIPE_CONTROL_CS_STALL |
298                               PIPE_CONTROL_STALL_AT_SCOREBOARD);
299         intel_ring_emit(engine, 0);
300         intel_ring_emit(engine, 0);
301         intel_ring_advance(engine);
302
303         return 0;
304 }
305
306 static int
307 gen7_render_ring_flush(struct drm_i915_gem_request *req,
308                        u32 invalidate_domains, u32 flush_domains)
309 {
310         struct intel_engine_cs *engine = req->engine;
311         u32 flags = 0;
312         u32 scratch_addr = engine->scratch.gtt_offset + 2 * CACHELINE_BYTES;
313         int ret;
314
315         /*
316          * Ensure that any following seqno writes only happen when the render
317          * cache is indeed flushed.
318          *
319          * Workaround: 4th PIPE_CONTROL command (except the ones with only
320          * read-cache invalidate bits set) must have the CS_STALL bit set. We
321          * don't try to be clever and just set it unconditionally.
322          */
323         flags |= PIPE_CONTROL_CS_STALL;
324
325         /* Just flush everything.  Experiments have shown that reducing the
326          * number of bits based on the write domains has little performance
327          * impact.
328          */
329         if (flush_domains) {
330                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
331                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
332                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
333                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
334         }
335         if (invalidate_domains) {
336                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
337                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
338                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
339                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
340                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
341                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
342                 flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;
343                 /*
344                  * TLB invalidate requires a post-sync write.
345                  */
346                 flags |= PIPE_CONTROL_QW_WRITE;
347                 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
348
349                 flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
350
351                 /* Workaround: we must issue a pipe_control with CS-stall bit
352                  * set before a pipe_control command that has the state cache
353                  * invalidate bit set. */
354                 gen7_render_ring_cs_stall_wa(req);
355         }
356
357         ret = intel_ring_begin(req, 4);
358         if (ret)
359                 return ret;
360
361         intel_ring_emit(engine, GFX_OP_PIPE_CONTROL(4));
362         intel_ring_emit(engine, flags);
363         intel_ring_emit(engine, scratch_addr);
364         intel_ring_emit(engine, 0);
365         intel_ring_advance(engine);
366
367         return 0;
368 }
369
370 static int
371 gen8_emit_pipe_control(struct drm_i915_gem_request *req,
372                        u32 flags, u32 scratch_addr)
373 {
374         struct intel_engine_cs *engine = req->engine;
375         int ret;
376
377         ret = intel_ring_begin(req, 6);
378         if (ret)
379                 return ret;
380
381         intel_ring_emit(engine, GFX_OP_PIPE_CONTROL(6));
382         intel_ring_emit(engine, flags);
383         intel_ring_emit(engine, scratch_addr);
384         intel_ring_emit(engine, 0);
385         intel_ring_emit(engine, 0);
386         intel_ring_emit(engine, 0);
387         intel_ring_advance(engine);
388
389         return 0;
390 }
391
392 static int
393 gen8_render_ring_flush(struct drm_i915_gem_request *req,
394                        u32 invalidate_domains, u32 flush_domains)
395 {
396         u32 flags = 0;
397         u32 scratch_addr = req->engine->scratch.gtt_offset + 2 * CACHELINE_BYTES;
398         int ret;
399
400         flags |= PIPE_CONTROL_CS_STALL;
401
402         if (flush_domains) {
403                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
404                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
405                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
406                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
407         }
408         if (invalidate_domains) {
409                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
410                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
411                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
412                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
413                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
414                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
415                 flags |= PIPE_CONTROL_QW_WRITE;
416                 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
417
418                 /* WaCsStallBeforeStateCacheInvalidate:bdw,chv */
419                 ret = gen8_emit_pipe_control(req,
420                                              PIPE_CONTROL_CS_STALL |
421                                              PIPE_CONTROL_STALL_AT_SCOREBOARD,
422                                              0);
423                 if (ret)
424                         return ret;
425         }
426
427         return gen8_emit_pipe_control(req, flags, scratch_addr);
428 }
429
430 static void ring_write_tail(struct intel_engine_cs *engine,
431                             u32 value)
432 {
433         struct drm_i915_private *dev_priv = engine->i915;
434         I915_WRITE_TAIL(engine, value);
435 }
436
437 u64 intel_ring_get_active_head(struct intel_engine_cs *engine)
438 {
439         struct drm_i915_private *dev_priv = engine->i915;
440         u64 acthd;
441
442         if (INTEL_GEN(dev_priv) >= 8)
443                 acthd = I915_READ64_2x32(RING_ACTHD(engine->mmio_base),
444                                          RING_ACTHD_UDW(engine->mmio_base));
445         else if (INTEL_GEN(dev_priv) >= 4)
446                 acthd = I915_READ(RING_ACTHD(engine->mmio_base));
447         else
448                 acthd = I915_READ(ACTHD);
449
450         return acthd;
451 }
452
453 static void ring_setup_phys_status_page(struct intel_engine_cs *engine)
454 {
455         struct drm_i915_private *dev_priv = engine->i915;
456         u32 addr;
457
458         addr = dev_priv->status_page_dmah->busaddr;
459         if (INTEL_GEN(dev_priv) >= 4)
460                 addr |= (dev_priv->status_page_dmah->busaddr >> 28) & 0xf0;
461         I915_WRITE(HWS_PGA, addr);
462 }
463
464 static void intel_ring_setup_status_page(struct intel_engine_cs *engine)
465 {
466         struct drm_i915_private *dev_priv = engine->i915;
467         i915_reg_t mmio;
468
469         /* The ring status page addresses are no longer next to the rest of
470          * the ring registers as of gen7.
471          */
472         if (IS_GEN7(dev_priv)) {
473                 switch (engine->id) {
474                 case RCS:
475                         mmio = RENDER_HWS_PGA_GEN7;
476                         break;
477                 case BCS:
478                         mmio = BLT_HWS_PGA_GEN7;
479                         break;
480                 /*
481                  * VCS2 doesn't actually exist on Gen7; it is listed here
482                  * only to silence gcc's switch-check warning
483                  */
484                 case VCS2:
485                 case VCS:
486                         mmio = BSD_HWS_PGA_GEN7;
487                         break;
488                 case VECS:
489                         mmio = VEBOX_HWS_PGA_GEN7;
490                         break;
491                 }
492         } else if (IS_GEN6(dev_priv)) {
493                 mmio = RING_HWS_PGA_GEN6(engine->mmio_base);
494         } else {
495                 /* XXX: gen8 returns to sanity */
496                 mmio = RING_HWS_PGA(engine->mmio_base);
497         }
498
499         I915_WRITE(mmio, (u32)engine->status_page.gfx_addr);
500         POSTING_READ(mmio);
501
502         /*
503          * Flush the TLB for this page
504          *
505          * FIXME: These two bits have disappeared on gen8, so a question
506          * arises: do we still need this and if so how should we go about
507          * invalidating the TLB?
508          */
509         if (IS_GEN(dev_priv, 6, 7)) {
510                 i915_reg_t reg = RING_INSTPM(engine->mmio_base);
511
512         /* ring should be idle before issuing a sync flush */
513                 WARN_ON((I915_READ_MODE(engine) & MODE_IDLE) == 0);
514
515                 I915_WRITE(reg,
516                            _MASKED_BIT_ENABLE(INSTPM_TLB_INVALIDATE |
517                                               INSTPM_SYNC_FLUSH));
518                 if (wait_for((I915_READ(reg) & INSTPM_SYNC_FLUSH) == 0,
519                              1000))
520                         DRM_ERROR("%s: wait for SyncFlush to complete for TLB invalidation timed out\n",
521                                   engine->name);
522         }
523 }
524
525 static bool stop_ring(struct intel_engine_cs *engine)
526 {
527         struct drm_i915_private *dev_priv = engine->i915;
528
529         if (!IS_GEN2(dev_priv)) {
530                 I915_WRITE_MODE(engine, _MASKED_BIT_ENABLE(STOP_RING));
531                 if (wait_for((I915_READ_MODE(engine) & MODE_IDLE) != 0, 1000)) {
532                         DRM_ERROR("%s : timed out trying to stop ring\n",
533                                   engine->name);
534                         /* Sometimes we observe that the idle flag is not
535                          * set even though the ring is empty. So double
536                          * check before giving up.
537                          */
538                         if (I915_READ_HEAD(engine) != I915_READ_TAIL(engine))
539                                 return false;
540                 }
541         }
542
543         I915_WRITE_CTL(engine, 0);
544         I915_WRITE_HEAD(engine, 0);
545         engine->write_tail(engine, 0);
546
547         if (!IS_GEN2(dev_priv)) {
548                 (void)I915_READ_CTL(engine);
549                 I915_WRITE_MODE(engine, _MASKED_BIT_DISABLE(STOP_RING));
550         }
551
552         return (I915_READ_HEAD(engine) & HEAD_ADDR) == 0;
553 }
554
555 void intel_engine_init_hangcheck(struct intel_engine_cs *engine)
556 {
557         memset(&engine->hangcheck, 0, sizeof(engine->hangcheck));
558 }
559
560 static int init_ring_common(struct intel_engine_cs *engine)
561 {
562         struct drm_i915_private *dev_priv = engine->i915;
563         struct intel_ringbuffer *ringbuf = engine->buffer;
564         struct drm_i915_gem_object *obj = ringbuf->obj;
565         int ret = 0;
566
567         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
568
569         if (!stop_ring(engine)) {
570                 /* G45 ring initialization often fails to reset head to zero */
571                 DRM_DEBUG_KMS("%s head not reset to zero "
572                               "ctl %08x head %08x tail %08x start %08x\n",
573                               engine->name,
574                               I915_READ_CTL(engine),
575                               I915_READ_HEAD(engine),
576                               I915_READ_TAIL(engine),
577                               I915_READ_START(engine));
578
579                 if (!stop_ring(engine)) {
580                         DRM_ERROR("failed to set %s head to zero "
581                                   "ctl %08x head %08x tail %08x start %08x\n",
582                                   engine->name,
583                                   I915_READ_CTL(engine),
584                                   I915_READ_HEAD(engine),
585                                   I915_READ_TAIL(engine),
586                                   I915_READ_START(engine));
587                         ret = -EIO;
588                         goto out;
589                 }
590         }
591
592         if (I915_NEED_GFX_HWS(dev_priv))
593                 intel_ring_setup_status_page(engine);
594         else
595                 ring_setup_phys_status_page(engine);
596
597         /* Enforce ordering by reading HEAD register back */
598         I915_READ_HEAD(engine);
599
600         /* Initialize the ring. This must happen _after_ we've cleared the ring
601          * registers with the above sequence (the readback of the HEAD registers
602          * also enforces ordering), otherwise the hw might lose the new ring
603          * register values. */
604         I915_WRITE_START(engine, i915_gem_obj_ggtt_offset(obj));
605
606         /* WaClearRingBufHeadRegAtInit:ctg,elk */
607         if (I915_READ_HEAD(engine))
608                 DRM_DEBUG("%s initialization failed [head=%08x], fudging\n",
609                           engine->name, I915_READ_HEAD(engine));
610         I915_WRITE_HEAD(engine, 0);
611         (void)I915_READ_HEAD(engine);
612
613         I915_WRITE_CTL(engine,
614                         ((ringbuf->size - PAGE_SIZE) & RING_NR_PAGES)
615                         | RING_VALID);
616
617         /* If the head is still not zero, the ring is dead */
618         if (wait_for((I915_READ_CTL(engine) & RING_VALID) != 0 &&
619                      I915_READ_START(engine) == i915_gem_obj_ggtt_offset(obj) &&
620                      (I915_READ_HEAD(engine) & HEAD_ADDR) == 0, 50)) {
621                 DRM_ERROR("%s initialization failed "
622                           "ctl %08x (valid? %d) head %08x tail %08x start %08x [expected %08lx]\n",
623                           engine->name,
624                           I915_READ_CTL(engine),
625                           I915_READ_CTL(engine) & RING_VALID,
626                           I915_READ_HEAD(engine), I915_READ_TAIL(engine),
627                           I915_READ_START(engine),
628                           (unsigned long)i915_gem_obj_ggtt_offset(obj));
629                 ret = -EIO;
630                 goto out;
631         }
632
633         ringbuf->last_retired_head = -1;
634         ringbuf->head = I915_READ_HEAD(engine);
635         ringbuf->tail = I915_READ_TAIL(engine) & TAIL_ADDR;
636         intel_ring_update_space(ringbuf);
637
638         intel_engine_init_hangcheck(engine);
639
640 out:
641         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
642
643         return ret;
644 }
645
646 void
647 intel_fini_pipe_control(struct intel_engine_cs *engine)
648 {
649         if (engine->scratch.obj == NULL)
650                 return;
651
652         if (INTEL_GEN(engine->i915) >= 5) {
653                 kunmap(sg_page(engine->scratch.obj->pages->sgl));
654                 i915_gem_object_ggtt_unpin(engine->scratch.obj);
655         }
656
657         drm_gem_object_unreference(&engine->scratch.obj->base);
658         engine->scratch.obj = NULL;
659 }
660
661 int
662 intel_init_pipe_control(struct intel_engine_cs *engine)
663 {
664         int ret;
665
666         WARN_ON(engine->scratch.obj);
667
668         engine->scratch.obj = i915_gem_object_create(engine->i915->dev, 4096);
669         if (IS_ERR(engine->scratch.obj)) {
670                 DRM_ERROR("Failed to allocate seqno page\n");
671                 ret = PTR_ERR(engine->scratch.obj);
672                 engine->scratch.obj = NULL;
673                 goto err;
674         }
675
676         ret = i915_gem_object_set_cache_level(engine->scratch.obj,
677                                               I915_CACHE_LLC);
678         if (ret)
679                 goto err_unref;
680
681         ret = i915_gem_obj_ggtt_pin(engine->scratch.obj, 4096, 0);
682         if (ret)
683                 goto err_unref;
684
685         engine->scratch.gtt_offset = i915_gem_obj_ggtt_offset(engine->scratch.obj);
686         engine->scratch.cpu_page = kmap(sg_page(engine->scratch.obj->pages->sgl));
687         if (engine->scratch.cpu_page == NULL) {
688                 ret = -ENOMEM;
689                 goto err_unpin;
690         }
691
692         DRM_DEBUG_DRIVER("%s pipe control offset: 0x%08x\n",
693                          engine->name, engine->scratch.gtt_offset);
694         return 0;
695
696 err_unpin:
697         i915_gem_object_ggtt_unpin(engine->scratch.obj);
698 err_unref:
699         drm_gem_object_unreference(&engine->scratch.obj->base);
700 err:
701         return ret;
702 }
703
704 static int intel_ring_workarounds_emit(struct drm_i915_gem_request *req)
705 {
706         struct intel_engine_cs *engine = req->engine;
707         struct i915_workarounds *w = &req->i915->workarounds;
708         int ret, i;
709
710         if (w->count == 0)
711                 return 0;
712
713         engine->gpu_caches_dirty = true;
714         ret = intel_ring_flush_all_caches(req);
715         if (ret)
716                 return ret;
717
718         ret = intel_ring_begin(req, (w->count * 2 + 2));
719         if (ret)
720                 return ret;
721
722         intel_ring_emit(engine, MI_LOAD_REGISTER_IMM(w->count));
723         for (i = 0; i < w->count; i++) {
724                 intel_ring_emit_reg(engine, w->reg[i].addr);
725                 intel_ring_emit(engine, w->reg[i].value);
726         }
727         intel_ring_emit(engine, MI_NOOP);
728
729         intel_ring_advance(engine);
730
731         engine->gpu_caches_dirty = true;
732         ret = intel_ring_flush_all_caches(req);
733         if (ret)
734                 return ret;
735
736         DRM_DEBUG_DRIVER("Number of Workarounds emitted: %d\n", w->count);
737
738         return 0;
739 }
740
741 static int intel_rcs_ctx_init(struct drm_i915_gem_request *req)
742 {
743         int ret;
744
745         ret = intel_ring_workarounds_emit(req);
746         if (ret != 0)
747                 return ret;
748
749         ret = i915_gem_render_state_init(req);
750         if (ret)
751                 return ret;
752
753         return 0;
754 }
755
756 static int wa_add(struct drm_i915_private *dev_priv,
757                   i915_reg_t addr,
758                   const u32 mask, const u32 val)
759 {
760         const u32 idx = dev_priv->workarounds.count;
761
762         if (WARN_ON(idx >= I915_MAX_WA_REGS))
763                 return -ENOSPC;
764
765         dev_priv->workarounds.reg[idx].addr = addr;
766         dev_priv->workarounds.reg[idx].value = val;
767         dev_priv->workarounds.reg[idx].mask = mask;
768
769         dev_priv->workarounds.count++;
770
771         return 0;
772 }
773
774 #define WA_REG(addr, mask, val) do { \
775                 const int r = wa_add(dev_priv, (addr), (mask), (val)); \
776                 if (r) \
777                         return r; \
778         } while (0)
779
780 #define WA_SET_BIT_MASKED(addr, mask) \
781         WA_REG(addr, (mask), _MASKED_BIT_ENABLE(mask))
782
783 #define WA_CLR_BIT_MASKED(addr, mask) \
784         WA_REG(addr, (mask), _MASKED_BIT_DISABLE(mask))
785
786 #define WA_SET_FIELD_MASKED(addr, mask, value) \
787         WA_REG(addr, mask, _MASKED_FIELD(mask, value))
788
789 #define WA_SET_BIT(addr, mask) WA_REG(addr, mask, I915_READ(addr) | (mask))
790 #define WA_CLR_BIT(addr, mask) WA_REG(addr, mask, I915_READ(addr) & ~(mask))
791
792 #define WA_WRITE(addr, val) WA_REG(addr, 0xffffffff, val)
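/*
 * The WA_* helpers above only record register/mask/value triplets in
 * dev_priv->workarounds; intel_ring_workarounds_emit() replays them later
 * with MI_LOAD_REGISTER_IMM. The *_MASKED variants target masked registers
 * (the write mask travels in the upper 16 bits via _MASKED_BIT_ENABLE/
 * _MASKED_BIT_DISABLE), whereas WA_SET_BIT/WA_CLR_BIT fold in the register
 * value read at setup time. Illustrative use:
 *
 *      WA_SET_BIT_MASKED(HDC_CHICKEN0, HDC_FORCE_NON_COHERENT);
 */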
793
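/*
 * Each engine exposes a small set of RING_FORCE_TO_NONPRIV slots; writing a
 * register offset into a free slot whitelists that register so it may be
 * written from non-privileged (userspace) batch buffers.
 */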
794 static int wa_ring_whitelist_reg(struct intel_engine_cs *engine,
795                                  i915_reg_t reg)
796 {
797         struct drm_i915_private *dev_priv = engine->i915;
798         struct i915_workarounds *wa = &dev_priv->workarounds;
799         const uint32_t index = wa->hw_whitelist_count[engine->id];
800
801         if (WARN_ON(index >= RING_MAX_NONPRIV_SLOTS))
802                 return -EINVAL;
803
804         WA_WRITE(RING_FORCE_TO_NONPRIV(engine->mmio_base, index),
805                  i915_mmio_reg_offset(reg));
806         wa->hw_whitelist_count[engine->id]++;
807
808         return 0;
809 }
810
811 static int gen8_init_workarounds(struct intel_engine_cs *engine)
812 {
813         struct drm_i915_private *dev_priv = engine->i915;
814
815         WA_SET_BIT_MASKED(INSTPM, INSTPM_FORCE_ORDERING);
816
817         /* WaDisableAsyncFlipPerfMode:bdw,chv */
818         WA_SET_BIT_MASKED(MI_MODE, ASYNC_FLIP_PERF_DISABLE);
819
820         /* WaDisablePartialInstShootdown:bdw,chv */
821         WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
822                           PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
823
824         /* Use Force Non-Coherent whenever executing a 3D context. This is a
825          * workaround for a possible hang in the unlikely event a TLB
826          * invalidation occurs during a PSD flush.
827          */
828         /* WaForceEnableNonCoherent:bdw,chv */
829         /* WaHdcDisableFetchWhenMasked:bdw,chv */
830         WA_SET_BIT_MASKED(HDC_CHICKEN0,
831                           HDC_DONOT_FETCH_MEM_WHEN_MASKED |
832                           HDC_FORCE_NON_COHERENT);
833
834         /* From the Haswell PRM, Command Reference: Registers, CACHE_MODE_0:
835          * "The Hierarchical Z RAW Stall Optimization allows non-overlapping
836          *  polygons in the same 8x4 pixel/sample area to be processed without
837          *  stalling waiting for the earlier ones to write to Hierarchical Z
838          *  buffer."
839          *
840          * This optimization is off by default for BDW and CHV; turn it on.
841          */
842         WA_CLR_BIT_MASKED(CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE);
843
844         /* Wa4x4STCOptimizationDisable:bdw,chv */
845         WA_SET_BIT_MASKED(CACHE_MODE_1, GEN8_4x4_STC_OPTIMIZATION_DISABLE);
846
847         /*
848          * BSpec recommends 8x4 when MSAA is used;
849          * however, in practice 16x4 seems fastest.
850          *
851          * Note that PS/WM thread counts depend on the WIZ hashing
852          * disable bit, which we don't touch here, but it's good
853          * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
854          */
855         WA_SET_FIELD_MASKED(GEN7_GT_MODE,
856                             GEN6_WIZ_HASHING_MASK,
857                             GEN6_WIZ_HASHING_16x4);
858
859         return 0;
860 }
861
862 static int bdw_init_workarounds(struct intel_engine_cs *engine)
863 {
864         struct drm_i915_private *dev_priv = engine->i915;
865         int ret;
866
867         ret = gen8_init_workarounds(engine);
868         if (ret)
869                 return ret;
870
871         /* WaDisableThreadStallDopClockGating:bdw (pre-production) */
872         WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
873
874         /* WaDisableDopClockGating:bdw */
875         WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2,
876                           DOP_CLOCK_GATING_DISABLE);
877
878         WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3,
879                           GEN8_SAMPLER_POWER_BYPASS_DIS);
880
881         WA_SET_BIT_MASKED(HDC_CHICKEN0,
882                           /* WaForceContextSaveRestoreNonCoherent:bdw */
883                           HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
884                           /* WaDisableFenceDestinationToSLM:bdw (pre-prod) */
885                           (IS_BDW_GT3(dev_priv) ? HDC_FENCE_DEST_SLM_DISABLE : 0));
886
887         return 0;
888 }
889
890 static int chv_init_workarounds(struct intel_engine_cs *engine)
891 {
892         struct drm_i915_private *dev_priv = engine->i915;
893         int ret;
894
895         ret = gen8_init_workarounds(engine);
896         if (ret)
897                 return ret;
898
899         /* WaDisableThreadStallDopClockGating:chv */
900         WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
901
902         /* Improve HiZ throughput on CHV. */
903         WA_SET_BIT_MASKED(HIZ_CHICKEN, CHV_HZ_8X8_MODE_IN_1X);
904
905         return 0;
906 }
907
908 static int gen9_init_workarounds(struct intel_engine_cs *engine)
909 {
910         struct drm_i915_private *dev_priv = engine->i915;
911         uint32_t tmp;
912         int ret;
913
914         /* WaEnableLbsSlaRetryTimerDecrement:skl */
915         I915_WRITE(BDW_SCRATCH1, I915_READ(BDW_SCRATCH1) |
916                    GEN9_LBS_SLA_RETRY_TIMER_DECREMENT_ENABLE);
917
918         /* WaDisableKillLogic:bxt,skl */
919         I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) |
920                    ECOCHK_DIS_TLB);
921
922         /* WaClearFlowControlGpgpuContextSave:skl,bxt */
923         /* WaDisablePartialInstShootdown:skl,bxt */
924         WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
925                           FLOW_CONTROL_ENABLE |
926                           PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
927
928         /* Syncing dependencies between camera and graphics:skl,bxt */
929         WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3,
930                           GEN9_DISABLE_OCL_OOB_SUPPRESS_LOGIC);
931
932         /* WaDisableDgMirrorFixInHalfSliceChicken5:skl,bxt */
933         if (IS_SKL_REVID(dev_priv, 0, SKL_REVID_B0) ||
934             IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1))
935                 WA_CLR_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN5,
936                                   GEN9_DG_MIRROR_FIX_ENABLE);
937
938         /* WaSetDisablePixMaskCammingAndRhwoInCommonSliceChicken:skl,bxt */
939         if (IS_SKL_REVID(dev_priv, 0, SKL_REVID_B0) ||
940             IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) {
941                 WA_SET_BIT_MASKED(GEN7_COMMON_SLICE_CHICKEN1,
942                                   GEN9_RHWO_OPTIMIZATION_DISABLE);
943                 /*
944                  * WA also requires GEN9_SLICE_COMMON_ECO_CHICKEN0[14:14] to be set
945                  * but we do that in the per-ctx batch buffer, as there is an
946                  * issue with this register not being restored on ctx restore
947                  */
948         }
949
950         /* WaEnableYV12BugFixInHalfSliceChicken7:skl,bxt */
951         /* WaEnableSamplerGPGPUPreemptionSupport:skl,bxt */
952         WA_SET_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN7,
953                           GEN9_ENABLE_YV12_BUGFIX |
954                           GEN9_ENABLE_GPGPU_PREEMPTION);
955
956         /* Wa4x4STCOptimizationDisable:skl,bxt */
957         /* WaDisablePartialResolveInVc:skl,bxt */
958         WA_SET_BIT_MASKED(CACHE_MODE_1, (GEN8_4x4_STC_OPTIMIZATION_DISABLE |
959                                          GEN9_PARTIAL_RESOLVE_IN_VC_DISABLE));
960
961         /* WaCcsTlbPrefetchDisable:skl,bxt */
962         WA_CLR_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN5,
963                           GEN9_CCS_TLB_PREFETCH_ENABLE);
964
965         /* WaDisableMaskBasedCammingInRCC:skl,bxt */
966         if (IS_SKL_REVID(dev_priv, SKL_REVID_C0, SKL_REVID_C0) ||
967             IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1))
968                 WA_SET_BIT_MASKED(SLICE_ECO_CHICKEN0,
969                                   PIXEL_MASK_CAMMING_DISABLE);
970
971         /* WaForceContextSaveRestoreNonCoherent:skl,bxt */
972         tmp = HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT;
973         if (IS_SKL_REVID(dev_priv, SKL_REVID_F0, REVID_FOREVER) ||
974             IS_BXT_REVID(dev_priv, BXT_REVID_B0, REVID_FOREVER))
975                 tmp |= HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE;
976         WA_SET_BIT_MASKED(HDC_CHICKEN0, tmp);
977
978         /* WaDisableSamplerPowerBypassForSOPingPong:skl,bxt */
979         if (IS_SKYLAKE(dev_priv) || IS_BXT_REVID(dev_priv, 0, BXT_REVID_B0))
980                 WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3,
981                                   GEN8_SAMPLER_POWER_BYPASS_DIS);
982
983         /* WaDisableSTUnitPowerOptimization:skl,bxt */
984         WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN2, GEN8_ST_PO_DISABLE);
985
986         /* WaOCLCoherentLineFlush:skl,bxt */
987         I915_WRITE(GEN8_L3SQCREG4, (I915_READ(GEN8_L3SQCREG4) |
988                                     GEN8_LQSC_FLUSH_COHERENT_LINES));
989
990         /* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt */
991         ret = wa_ring_whitelist_reg(engine, GEN9_CTX_PREEMPT_REG);
992         if (ret)
993                 return ret;
994
995         /* WaEnablePreemptionGranularityControlByUMD:skl,bxt */
996         ret = wa_ring_whitelist_reg(engine, GEN8_CS_CHICKEN1);
997         if (ret)
998                 return ret;
999
1000         /* WaAllowUMDToModifyHDCChicken1:skl,bxt */
1001         ret = wa_ring_whitelist_reg(engine, GEN8_HDC_CHICKEN1);
1002         if (ret)
1003                 return ret;
1004
1005         return 0;
1006 }
1007
1008 static int skl_tune_iz_hashing(struct intel_engine_cs *engine)
1009 {
1010         struct drm_i915_private *dev_priv = engine->i915;
1011         u8 vals[3] = { 0, 0, 0 };
1012         unsigned int i;
1013
1014         for (i = 0; i < 3; i++) {
1015                 u8 ss;
1016
1017                 /*
1018                  * Only consider slices where one, and only one, subslice has 7
1019                  * EUs
1020                  */
1021                 if (!is_power_of_2(dev_priv->info.subslice_7eu[i]))
1022                         continue;
1023
1024                 /*
1025                  * subslice_7eu[i] != 0 (because of the check above) and
1026                  * ss_max == 4 (maximum number of subslices possible per slice)
1027                  *
1028                  * ->    0 <= ss <= 3;
1029                  */
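                /*
                 * Illustrative example: subslice_7eu[i] == 0x4 (only
                 * subslice 2 has 7 EUs) -> ffs() == 3 -> ss == 2 ->
                 * vals[i] == 3 - 2 == 1.
                 */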
1030                 ss = ffs(dev_priv->info.subslice_7eu[i]) - 1;
1031                 vals[i] = 3 - ss;
1032         }
1033
1034         if (vals[0] == 0 && vals[1] == 0 && vals[2] == 0)
1035                 return 0;
1036
1037         /* Tune IZ hashing. See intel_device_info_runtime_init() */
1038         WA_SET_FIELD_MASKED(GEN7_GT_MODE,
1039                             GEN9_IZ_HASHING_MASK(2) |
1040                             GEN9_IZ_HASHING_MASK(1) |
1041                             GEN9_IZ_HASHING_MASK(0),
1042                             GEN9_IZ_HASHING(2, vals[2]) |
1043                             GEN9_IZ_HASHING(1, vals[1]) |
1044                             GEN9_IZ_HASHING(0, vals[0]));
1045
1046         return 0;
1047 }
1048
1049 static int skl_init_workarounds(struct intel_engine_cs *engine)
1050 {
1051         struct drm_i915_private *dev_priv = engine->i915;
1052         int ret;
1053
1054         ret = gen9_init_workarounds(engine);
1055         if (ret)
1056                 return ret;
1057
1058         /*
1059          * The actual WA is to disable per-context preemption granularity control
1060          * until D0, which is the default case, so this is equivalent to
1061          * !WaDisablePerCtxtPreemptionGranularityControl:skl
1062          */
1063         if (IS_SKL_REVID(dev_priv, SKL_REVID_E0, REVID_FOREVER)) {
1064                 I915_WRITE(GEN7_FF_SLICE_CS_CHICKEN1,
1065                            _MASKED_BIT_ENABLE(GEN9_FFSC_PERCTX_PREEMPT_CTRL));
1066         }
1067
1068         if (IS_SKL_REVID(dev_priv, 0, SKL_REVID_D0)) {
1069                 /* WaDisableChickenBitTSGBarrierAckForFFSliceCS:skl */
1070                 I915_WRITE(FF_SLICE_CS_CHICKEN2,
1071                            _MASKED_BIT_ENABLE(GEN9_TSG_BARRIER_ACK_DISABLE));
1072         }
1073
1074         /* GEN8_L3SQCREG4 has a dependency on the WA batch, so any new changes
1075          * involving this register should also be added to the WA batch as required.
1076          */
1077         if (IS_SKL_REVID(dev_priv, 0, SKL_REVID_E0))
1078                 /* WaDisableLSQCROPERFforOCL:skl */
1079                 I915_WRITE(GEN8_L3SQCREG4, I915_READ(GEN8_L3SQCREG4) |
1080                            GEN8_LQSC_RO_PERF_DIS);
1081
1082         /* WaEnableGapsTsvCreditFix:skl */
1083         if (IS_SKL_REVID(dev_priv, SKL_REVID_C0, REVID_FOREVER)) {
1084                 I915_WRITE(GEN8_GARBCNTL, (I915_READ(GEN8_GARBCNTL) |
1085                                            GEN9_GAPS_TSV_CREDIT_DISABLE));
1086         }
1087
1088         /* WaDisablePowerCompilerClockGating:skl */
1089         if (IS_SKL_REVID(dev_priv, SKL_REVID_B0, SKL_REVID_B0))
1090                 WA_SET_BIT_MASKED(HIZ_CHICKEN,
1091                                   BDW_HIZ_POWER_COMPILER_CLOCK_GATING_DISABLE);
1092
1093         /* This is tied to WaForceContextSaveRestoreNonCoherent */
1094         if (IS_SKL_REVID(dev_priv, 0, REVID_FOREVER)) {
1095                 /*
1096                  * Use Force Non-Coherent whenever executing a 3D context. This
1097                  * is a workaround for a possible hang in the unlikely event
1098                  * a TLB invalidation occurs during a PSD flush.
1099                  */
1100                 /* WaForceEnableNonCoherent:skl */
1101                 WA_SET_BIT_MASKED(HDC_CHICKEN0,
1102                                   HDC_FORCE_NON_COHERENT);
1103
1104                 /* WaDisableHDCInvalidation:skl */
1105                 I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) |
1106                            BDW_DISABLE_HDC_INVALIDATION);
1107         }
1108
1109         /* WaBarrierPerformanceFixDisable:skl */
1110         if (IS_SKL_REVID(dev_priv, SKL_REVID_C0, SKL_REVID_D0))
1111                 WA_SET_BIT_MASKED(HDC_CHICKEN0,
1112                                   HDC_FENCE_DEST_SLM_DISABLE |
1113                                   HDC_BARRIER_PERFORMANCE_DISABLE);
1114
1115         /* WaDisableSbeCacheDispatchPortSharing:skl */
1116         if (IS_SKL_REVID(dev_priv, 0, SKL_REVID_F0))
1117                 WA_SET_BIT_MASKED(
1118                         GEN7_HALF_SLICE_CHICKEN1,
1119                         GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
1120
1121         /* WaDisableGafsUnitClkGating:skl */
1122         WA_SET_BIT(GEN7_UCGCTL4, GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1123
1124         /* WaDisableLSQCROPERFforOCL:skl */
1125         ret = wa_ring_whitelist_reg(engine, GEN8_L3SQCREG4);
1126         if (ret)
1127                 return ret;
1128
1129         return skl_tune_iz_hashing(engine);
1130 }
1131
1132 static int bxt_init_workarounds(struct intel_engine_cs *engine)
1133 {
1134         struct drm_i915_private *dev_priv = engine->i915;
1135         int ret;
1136
1137         ret = gen9_init_workarounds(engine);
1138         if (ret)
1139                 return ret;
1140
1141         /* WaStoreMultiplePTEenable:bxt */
1142         /* This is a requirement according to the hardware specification */
1143         if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1))
1144                 I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_TLBPF);
1145
1146         /* WaSetClckGatingDisableMedia:bxt */
1147         if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) {
1148                 I915_WRITE(GEN7_MISCCPCTL, (I915_READ(GEN7_MISCCPCTL) &
1149                                             ~GEN8_DOP_CLOCK_GATE_MEDIA_ENABLE));
1150         }
1151
1152         /* WaDisableThreadStallDopClockGating:bxt */
1153         WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
1154                           STALL_DOP_GATING_DISABLE);
1155
1156         /* WaDisableSbeCacheDispatchPortSharing:bxt */
1157         if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_B0)) {
1158                 WA_SET_BIT_MASKED(
1159                         GEN7_HALF_SLICE_CHICKEN1,
1160                         GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
1161         }
1162
1163         /* WaDisableObjectLevelPreemptionForTrifanOrPolygon:bxt */
1164         /* WaDisableObjectLevelPreemptionForInstancedDraw:bxt */
1165         /* WaDisableObjectLevelPreemtionForInstanceId:bxt */
1166         /* WaDisableLSQCROPERFforOCL:bxt */
1167         if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) {
1168                 ret = wa_ring_whitelist_reg(engine, GEN9_CS_DEBUG_MODE1);
1169                 if (ret)
1170                         return ret;
1171
1172                 ret = wa_ring_whitelist_reg(engine, GEN8_L3SQCREG4);
1173                 if (ret)
1174                         return ret;
1175         }
1176
1177         /* WaProgramL3SqcReg1DefaultForPerf:bxt */
1178         if (IS_BXT_REVID(dev_priv, BXT_REVID_B0, REVID_FOREVER))
1179                 I915_WRITE(GEN8_L3SQCREG1, L3_GENERAL_PRIO_CREDITS(62) |
1180                                            L3_HIGH_PRIO_CREDITS(2));
1181
1182         return 0;
1183 }
1184
1185 int init_workarounds_ring(struct intel_engine_cs *engine)
1186 {
1187         struct drm_i915_private *dev_priv = engine->i915;
1188
1189         WARN_ON(engine->id != RCS);
1190
1191         dev_priv->workarounds.count = 0;
1192         dev_priv->workarounds.hw_whitelist_count[RCS] = 0;
1193
1194         if (IS_BROADWELL(dev_priv))
1195                 return bdw_init_workarounds(engine);
1196
1197         if (IS_CHERRYVIEW(dev_priv))
1198                 return chv_init_workarounds(engine);
1199
1200         if (IS_SKYLAKE(dev_priv))
1201                 return skl_init_workarounds(engine);
1202
1203         if (IS_BROXTON(dev_priv))
1204                 return bxt_init_workarounds(engine);
1205
1206         return 0;
1207 }
1208
1209 static int init_render_ring(struct intel_engine_cs *engine)
1210 {
1211         struct drm_i915_private *dev_priv = engine->i915;
1212         int ret = init_ring_common(engine);
1213         if (ret)
1214                 return ret;
1215
1216         /* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */
1217         if (IS_GEN(dev_priv, 4, 6))
1218                 I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH));
1219
1220         /* We need to disable the AsyncFlip performance optimisations in order
1221          * to use MI_WAIT_FOR_EVENT within the CS. It should already be
1222          * programmed to '1' on all products.
1223          *
1224          * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv
1225          */
1226         if (IS_GEN(dev_priv, 6, 7))
1227                 I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(ASYNC_FLIP_PERF_DISABLE));
1228
1229         /* Required for the hardware to program scanline values for waiting */
1230         /* WaEnableFlushTlbInvalidationMode:snb */
1231         if (IS_GEN6(dev_priv))
1232                 I915_WRITE(GFX_MODE,
1233                            _MASKED_BIT_ENABLE(GFX_TLB_INVALIDATE_EXPLICIT));
1234
1235         /* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */
1236         if (IS_GEN7(dev_priv))
1237                 I915_WRITE(GFX_MODE_GEN7,
1238                            _MASKED_BIT_ENABLE(GFX_TLB_INVALIDATE_EXPLICIT) |
1239                            _MASKED_BIT_ENABLE(GFX_REPLAY_MODE));
1240
1241         if (IS_GEN6(dev_priv)) {
1242                 /* From the Sandybridge PRM, volume 1 part 3, page 24:
1243                  * "If this bit is set, STCunit will have LRA as replacement
1244                  *  policy. [...] This bit must be reset.  LRA replacement
1245                  *  policy is not supported."
1246                  */
1247                 I915_WRITE(CACHE_MODE_0,
1248                            _MASKED_BIT_DISABLE(CM0_STC_EVICT_DISABLE_LRA_SNB));
1249         }
1250
1251         if (IS_GEN(dev_priv, 6, 7))
1252                 I915_WRITE(INSTPM, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
1253
1254         if (HAS_L3_DPF(dev_priv))
1255                 I915_WRITE_IMR(engine, ~GT_PARITY_ERROR(dev_priv));
1256
1257         return init_workarounds_ring(engine);
1258 }
1259
1260 static void render_ring_cleanup(struct intel_engine_cs *engine)
1261 {
1262         struct drm_i915_private *dev_priv = engine->i915;
1263
1264         if (dev_priv->semaphore_obj) {
1265                 i915_gem_object_ggtt_unpin(dev_priv->semaphore_obj);
1266                 drm_gem_object_unreference(&dev_priv->semaphore_obj->base);
1267                 dev_priv->semaphore_obj = NULL;
1268         }
1269
1270         intel_fini_pipe_control(engine);
1271 }
1272
1273 static int gen8_rcs_signal(struct drm_i915_gem_request *signaller_req,
1274                            unsigned int num_dwords)
1275 {
1276 #define MBOX_UPDATE_DWORDS 8
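/*
 * 8 dwords per waiter: a 6-dword PIPE_CONTROL qword write plus a 2-dword
 * MI_SEMAPHORE_SIGNAL, matching the loop below.
 */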
1277         struct intel_engine_cs *signaller = signaller_req->engine;
1278         struct drm_i915_private *dev_priv = signaller_req->i915;
1279         struct intel_engine_cs *waiter;
1280         enum intel_engine_id id;
1281         int ret, num_rings;
1282
1283         num_rings = hweight32(INTEL_INFO(dev_priv)->ring_mask);
1284         num_dwords += (num_rings-1) * MBOX_UPDATE_DWORDS;
1285 #undef MBOX_UPDATE_DWORDS
1286
1287         ret = intel_ring_begin(signaller_req, num_dwords);
1288         if (ret)
1289                 return ret;
1290
1291         for_each_engine_id(waiter, dev_priv, id) {
1292                 u32 seqno;
1293                 u64 gtt_offset = signaller->semaphore.signal_ggtt[id];
1294                 if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID)
1295                         continue;
1296
1297                 seqno = i915_gem_request_get_seqno(signaller_req);
1298                 intel_ring_emit(signaller, GFX_OP_PIPE_CONTROL(6));
1299                 intel_ring_emit(signaller, PIPE_CONTROL_GLOBAL_GTT_IVB |
1300                                            PIPE_CONTROL_QW_WRITE |
1301                                            PIPE_CONTROL_CS_STALL);
1302                 intel_ring_emit(signaller, lower_32_bits(gtt_offset));
1303                 intel_ring_emit(signaller, upper_32_bits(gtt_offset));
1304                 intel_ring_emit(signaller, seqno);
1305                 intel_ring_emit(signaller, 0);
1306                 intel_ring_emit(signaller, MI_SEMAPHORE_SIGNAL |
1307                                            MI_SEMAPHORE_TARGET(waiter->hw_id));
1308                 intel_ring_emit(signaller, 0);
1309         }
1310
1311         return 0;
1312 }
1313
1314 static int gen8_xcs_signal(struct drm_i915_gem_request *signaller_req,
1315                            unsigned int num_dwords)
1316 {
1317 #define MBOX_UPDATE_DWORDS 6
1318         struct intel_engine_cs *signaller = signaller_req->engine;
1319         struct drm_i915_private *dev_priv = signaller_req->i915;
1320         struct intel_engine_cs *waiter;
1321         enum intel_engine_id id;
1322         int ret, num_rings;
1323
1324         num_rings = hweight32(INTEL_INFO(dev_priv)->ring_mask);
1325         num_dwords += (num_rings-1) * MBOX_UPDATE_DWORDS;
1326 #undef MBOX_UPDATE_DWORDS
1327
1328         ret = intel_ring_begin(signaller_req, num_dwords);
1329         if (ret)
1330                 return ret;
1331
1332         for_each_engine_id(waiter, dev_priv, id) {
1333                 u32 seqno;
1334                 u64 gtt_offset = signaller->semaphore.signal_ggtt[id];
1335                 if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID)
1336                         continue;
1337
1338                 seqno = i915_gem_request_get_seqno(signaller_req);
1339                 intel_ring_emit(signaller, (MI_FLUSH_DW + 1) |
1340                                            MI_FLUSH_DW_OP_STOREDW);
1341                 intel_ring_emit(signaller, lower_32_bits(gtt_offset) |
1342                                            MI_FLUSH_DW_USE_GTT);
1343                 intel_ring_emit(signaller, upper_32_bits(gtt_offset));
1344                 intel_ring_emit(signaller, seqno);
1345                 intel_ring_emit(signaller, MI_SEMAPHORE_SIGNAL |
1346                                            MI_SEMAPHORE_TARGET(waiter->hw_id));
1347                 intel_ring_emit(signaller, 0);
1348         }
1349
1350         return 0;
1351 }
1352
1353 static int gen6_signal(struct drm_i915_gem_request *signaller_req,
1354                        unsigned int num_dwords)
1355 {
1356         struct intel_engine_cs *signaller = signaller_req->engine;
1357         struct drm_i915_private *dev_priv = signaller_req->i915;
1358         struct intel_engine_cs *useless;
1359         enum intel_engine_id id;
1360         int ret, num_rings;
1361
1362 #define MBOX_UPDATE_DWORDS 3
1363         num_rings = hweight32(INTEL_INFO(dev_priv)->ring_mask);
1364         num_dwords += round_up((num_rings-1) * MBOX_UPDATE_DWORDS, 2);
1365 #undef MBOX_UPDATE_DWORDS
1366
1367         ret = intel_ring_begin(signaller_req, num_dwords);
1368         if (ret)
1369                 return ret;
1370
1371         for_each_engine_id(useless, dev_priv, id) {
1372                 i915_reg_t mbox_reg = signaller->semaphore.mbox.signal[id];
1373
1374                 if (i915_mmio_reg_valid(mbox_reg)) {
1375                         u32 seqno = i915_gem_request_get_seqno(signaller_req);
1376
1377                         intel_ring_emit(signaller, MI_LOAD_REGISTER_IMM(1));
1378                         intel_ring_emit_reg(signaller, mbox_reg);
1379                         intel_ring_emit(signaller, seqno);
1380                 }
1381         }
1382
1383         /* If num_dwords was rounded, make sure the tail pointer is correct */
1384         if (num_rings % 2 == 0)
1385                 intel_ring_emit(signaller, MI_NOOP);
1386
1387         return 0;
1388 }
1389
1390 /**
1391  * gen6_add_request - Update the semaphore mailbox registers
1392  *
1393  * @req - request to write to the ring
1394  *
1395  * Update the mailbox registers in the *other* rings with the current seqno.
1396  * This acts like a signal in the canonical semaphore.
1397  */
1398 static int
1399 gen6_add_request(struct drm_i915_gem_request *req)
1400 {
1401         struct intel_engine_cs *engine = req->engine;
1402         int ret;
1403
1404         if (engine->semaphore.signal)
1405                 ret = engine->semaphore.signal(req, 4);
1406         else
1407                 ret = intel_ring_begin(req, 4);
1408
1409         if (ret)
1410                 return ret;
1411
1412         intel_ring_emit(engine, MI_STORE_DWORD_INDEX);
1413         intel_ring_emit(engine,
1414                         I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT);
1415         intel_ring_emit(engine, i915_gem_request_get_seqno(req));
1416         intel_ring_emit(engine, MI_USER_INTERRUPT);
1417         __intel_ring_advance(engine);
1418
1419         return 0;
1420 }
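/*
 * Illustrative note (a sketch of what the four dwords above achieve):
 * MI_STORE_DWORD_INDEX writes the request's seqno into the hardware
 * status page at I915_GEM_HWS_INDEX, and MI_USER_INTERRUPT then raises
 * the completion interrupt.  The value lands where ring_get_seqno()
 * below reads it back via intel_read_status_page(engine,
 * I915_GEM_HWS_INDEX).
 */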
1421
1422 static int
1423 gen8_render_add_request(struct drm_i915_gem_request *req)
1424 {
1425         struct intel_engine_cs *engine = req->engine;
1426         int ret;
1427
1428         if (engine->semaphore.signal)
1429                 ret = engine->semaphore.signal(req, 8);
1430         else
1431                 ret = intel_ring_begin(req, 8);
1432         if (ret)
1433                 return ret;
1434
1435         intel_ring_emit(engine, GFX_OP_PIPE_CONTROL(6));
1436         intel_ring_emit(engine, (PIPE_CONTROL_GLOBAL_GTT_IVB |
1437                                  PIPE_CONTROL_CS_STALL |
1438                                  PIPE_CONTROL_QW_WRITE));
1439         intel_ring_emit(engine, intel_hws_seqno_address(req->engine));
1440         intel_ring_emit(engine, 0);
1441         intel_ring_emit(engine, i915_gem_request_get_seqno(req));
1442         /* We're trashing one dword of the HWS. */
1443         intel_ring_emit(engine, 0);
1444         intel_ring_emit(engine, MI_USER_INTERRUPT);
1445         intel_ring_emit(engine, MI_NOOP);
1446         __intel_ring_advance(engine);
1447
1448         return 0;
1449 }
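/*
 * Sketch of the emission above (eight dwords, matching
 * intel_ring_begin(req, 8)): GFX_OP_PIPE_CONTROL(6), the flags, the
 * low/high halves of the HWS seqno address, the seqno, a zero upper data
 * dword, then MI_USER_INTERRUPT and an MI_NOOP pad.  The qword write is
 * why one extra dword of the HWS next to the seqno slot gets clobbered,
 * as noted in the comment above.
 */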
1450
1451 static inline bool i915_gem_has_seqno_wrapped(struct drm_i915_private *dev_priv,
1452                                               u32 seqno)
1453 {
1454         return dev_priv->last_seqno < seqno;
1455 }
1456
1457 /**
1458  * gen8_ring_sync - sync the waiter to the signaller on seqno
1459  *
1460  * @waiter_req: request on the ring that is waiting
1461  * @signaller: ring which has, or will signal
1462  * @seqno: seqno which the waiter will block on
1463  */
1464
1465 static int
1466 gen8_ring_sync(struct drm_i915_gem_request *waiter_req,
1467                struct intel_engine_cs *signaller,
1468                u32 seqno)
1469 {
1470         struct intel_engine_cs *waiter = waiter_req->engine;
1471         struct drm_i915_private *dev_priv = waiter_req->i915;
1472         struct i915_hw_ppgtt *ppgtt;
1473         int ret;
1474
1475         ret = intel_ring_begin(waiter_req, 4);
1476         if (ret)
1477                 return ret;
1478
1479         intel_ring_emit(waiter, MI_SEMAPHORE_WAIT |
1480                                 MI_SEMAPHORE_GLOBAL_GTT |
1481                                 MI_SEMAPHORE_SAD_GTE_SDD);
1482         intel_ring_emit(waiter, seqno);
1483         intel_ring_emit(waiter,
1484                         lower_32_bits(GEN8_WAIT_OFFSET(waiter, signaller->id)));
1485         intel_ring_emit(waiter,
1486                         upper_32_bits(GEN8_WAIT_OFFSET(waiter, signaller->id)));
1487         intel_ring_advance(waiter);
1488
1489         /* When the !RCS engines idle waiting upon a semaphore, they lose their
1490          * pagetables and we must reload them before executing the batch.
1491          * We do this on the i915_switch_context() following the wait and
1492          * before the dispatch.
1493          */
1494         ppgtt = waiter_req->ctx->ppgtt;
1495         if (ppgtt && waiter_req->engine->id != RCS)
1496                 ppgtt->pd_dirty_rings |= intel_engine_flag(waiter_req->engine);
1497         return 0;
1498 }
1499
1500 static int
1501 gen6_ring_sync(struct drm_i915_gem_request *waiter_req,
1502                struct intel_engine_cs *signaller,
1503                u32 seqno)
1504 {
1505         struct intel_engine_cs *waiter = waiter_req->engine;
1506         u32 dw1 = MI_SEMAPHORE_MBOX |
1507                   MI_SEMAPHORE_COMPARE |
1508                   MI_SEMAPHORE_REGISTER;
1509         u32 wait_mbox = signaller->semaphore.mbox.wait[waiter->id];
1510         int ret;
1511
1512         /* Throughout the GEM code, a seqno is considered passed when our
1513          * current seqno is >= the seqno in question. For the hardware,
1514          * however, the comparison is strictly greater than.
1515          */
1516         seqno -= 1;
1517
1518         WARN_ON(wait_mbox == MI_SEMAPHORE_SYNC_INVALID);
1519
1520         ret = intel_ring_begin(waiter_req, 4);
1521         if (ret)
1522                 return ret;
1523
1524         /* If seqno wrap happened, omit the wait with no-ops */
1525         if (likely(!i915_gem_has_seqno_wrapped(waiter_req->i915, seqno))) {
1526                 intel_ring_emit(waiter, dw1 | wait_mbox);
1527                 intel_ring_emit(waiter, seqno);
1528                 intel_ring_emit(waiter, 0);
1529                 intel_ring_emit(waiter, MI_NOOP);
1530         } else {
1531                 intel_ring_emit(waiter, MI_NOOP);
1532                 intel_ring_emit(waiter, MI_NOOP);
1533                 intel_ring_emit(waiter, MI_NOOP);
1534                 intel_ring_emit(waiter, MI_NOOP);
1535         }
1536         intel_ring_advance(waiter);
1537
1538         return 0;
1539 }
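/*
 * Worked example of the seqno adjustment above: to treat seqno 100 as
 * passed (the GEM ">=" convention) while the MI_SEMAPHORE_MBOX compare
 * is strictly greater-than, the waiter is programmed with 100 - 1 = 99,
 * so the wait completes as soon as the signaller's mailbox reaches 100.
 */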
1540
1541 #define PIPE_CONTROL_FLUSH(ring__, addr__)                                      \
1542 do {                                                                    \
1543         intel_ring_emit(ring__, GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE |                \
1544                  PIPE_CONTROL_DEPTH_STALL);                             \
1545         intel_ring_emit(ring__, (addr__) | PIPE_CONTROL_GLOBAL_GTT);                    \
1546         intel_ring_emit(ring__, 0);                                                     \
1547         intel_ring_emit(ring__, 0);                                                     \
1548 } while (0)
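/*
 * Each PIPE_CONTROL_FLUSH() invocation expands to the four dwords above
 * (GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_DEPTH_STALL,
 * the scratch address | PIPE_CONTROL_GLOBAL_GTT, and two zeros).
 * pc_render_add_request() below therefore emits 4 + 6 * 4 + 4 = 32
 * dwords, matching its intel_ring_begin(req, 32).
 */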
1549
1550 static int
1551 pc_render_add_request(struct drm_i915_gem_request *req)
1552 {
1553         struct intel_engine_cs *engine = req->engine;
1554         u32 scratch_addr = engine->scratch.gtt_offset + 2 * CACHELINE_BYTES;
1555         int ret;
1556
1557         /* For Ironlake, MI_USER_INTERRUPT was deprecated and apparently
1558          * incoherent with writes to memory, i.e. completely fubar,
1559          * so we need to use PIPE_NOTIFY instead.
1560          *
1561          * However, we also need to work around the qword write
1562          * incoherence by flushing the 6 PIPE_NOTIFY buffers out to
1563          * memory before requesting an interrupt.
1564          */
1565         ret = intel_ring_begin(req, 32);
1566         if (ret)
1567                 return ret;
1568
1569         intel_ring_emit(engine,
1570                         GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE |
1571                         PIPE_CONTROL_WRITE_FLUSH |
1572                         PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
1573         intel_ring_emit(engine,
1574                         engine->scratch.gtt_offset | PIPE_CONTROL_GLOBAL_GTT);
1575         intel_ring_emit(engine, i915_gem_request_get_seqno(req));
1576         intel_ring_emit(engine, 0);
1577         PIPE_CONTROL_FLUSH(engine, scratch_addr);
1578         scratch_addr += 2 * CACHELINE_BYTES; /* write to separate cachelines */
1579         PIPE_CONTROL_FLUSH(engine, scratch_addr);
1580         scratch_addr += 2 * CACHELINE_BYTES;
1581         PIPE_CONTROL_FLUSH(engine, scratch_addr);
1582         scratch_addr += 2 * CACHELINE_BYTES;
1583         PIPE_CONTROL_FLUSH(engine, scratch_addr);
1584         scratch_addr += 2 * CACHELINE_BYTES;
1585         PIPE_CONTROL_FLUSH(engine, scratch_addr);
1586         scratch_addr += 2 * CACHELINE_BYTES;
1587         PIPE_CONTROL_FLUSH(engine, scratch_addr);
1588
1589         intel_ring_emit(engine,
1590                         GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE |
1591                         PIPE_CONTROL_WRITE_FLUSH |
1592                         PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
1593                         PIPE_CONTROL_NOTIFY);
1594         intel_ring_emit(engine,
1595                         engine->scratch.gtt_offset | PIPE_CONTROL_GLOBAL_GTT);
1596         intel_ring_emit(engine, i915_gem_request_get_seqno(req));
1597         intel_ring_emit(engine, 0);
1598         __intel_ring_advance(engine);
1599
1600         return 0;
1601 }
1602
1603 static void
1604 gen6_seqno_barrier(struct intel_engine_cs *engine)
1605 {
1606         struct drm_i915_private *dev_priv = engine->i915;
1607
1608         /* Workaround to force correct ordering between irq and seqno writes on
1609          * ivb (and maybe also on snb) by reading from a CS register (like
1610          * ACTHD) before reading the status page.
1611          *
1612          * Note that this effectively stalls the read by the time it takes to
1613          * do a memory transaction, which more or less ensures that the write
1614          * from the GPU has sufficient time to invalidate the CPU cacheline.
1615          * Alternatively we could delay the interrupt from the CS ring to give
1616          * the write time to land, but that would incur a delay after every
1617          * batch i.e. much more frequent than a delay when waiting for the
1618          * interrupt (with the same net latency).
1619          *
1620          * Also note that to prevent whole machine hangs on gen7, we have to
1621          * take the spinlock to guard against concurrent cacheline access.
1622          */
1623         spin_lock_irq(&dev_priv->uncore.lock);
1624         POSTING_READ_FW(RING_ACTHD(engine->mmio_base));
1625         spin_unlock_irq(&dev_priv->uncore.lock);
1626 }
1627
1628 static u32
1629 ring_get_seqno(struct intel_engine_cs *engine)
1630 {
1631         return intel_read_status_page(engine, I915_GEM_HWS_INDEX);
1632 }
1633
1634 static void
1635 ring_set_seqno(struct intel_engine_cs *engine, u32 seqno)
1636 {
1637         intel_write_status_page(engine, I915_GEM_HWS_INDEX, seqno);
1638 }
1639
1640 static u32
1641 pc_render_get_seqno(struct intel_engine_cs *engine)
1642 {
1643         return engine->scratch.cpu_page[0];
1644 }
1645
1646 static void
1647 pc_render_set_seqno(struct intel_engine_cs *engine, u32 seqno)
1648 {
1649         engine->scratch.cpu_page[0] = seqno;
1650 }
1651
1652 static bool
1653 gen5_ring_get_irq(struct intel_engine_cs *engine)
1654 {
1655         struct drm_i915_private *dev_priv = engine->i915;
1656         unsigned long flags;
1657
1658         if (WARN_ON(!intel_irqs_enabled(dev_priv)))
1659                 return false;
1660
1661         spin_lock_irqsave(&dev_priv->irq_lock, flags);
1662         if (engine->irq_refcount++ == 0)
1663                 gen5_enable_gt_irq(dev_priv, engine->irq_enable_mask);
1664         spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
1665
1666         return true;
1667 }
1668
1669 static void
1670 gen5_ring_put_irq(struct intel_engine_cs *engine)
1671 {
1672         struct drm_i915_private *dev_priv = engine->i915;
1673         unsigned long flags;
1674
1675         spin_lock_irqsave(&dev_priv->irq_lock, flags);
1676         if (--engine->irq_refcount == 0)
1677                 gen5_disable_gt_irq(dev_priv, engine->irq_enable_mask);
1678         spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
1679 }
1680
1681 static bool
1682 i9xx_ring_get_irq(struct intel_engine_cs *engine)
1683 {
1684         struct drm_i915_private *dev_priv = engine->i915;
1685         unsigned long flags;
1686
1687         if (!intel_irqs_enabled(dev_priv))
1688                 return false;
1689
1690         spin_lock_irqsave(&dev_priv->irq_lock, flags);
1691         if (engine->irq_refcount++ == 0) {
1692                 dev_priv->irq_mask &= ~engine->irq_enable_mask;
1693                 I915_WRITE(IMR, dev_priv->irq_mask);
1694                 POSTING_READ(IMR);
1695         }
1696         spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
1697
1698         return true;
1699 }
1700
1701 static void
1702 i9xx_ring_put_irq(struct intel_engine_cs *engine)
1703 {
1704         struct drm_i915_private *dev_priv = engine->i915;
1705         unsigned long flags;
1706
1707         spin_lock_irqsave(&dev_priv->irq_lock, flags);
1708         if (--engine->irq_refcount == 0) {
1709                 dev_priv->irq_mask |= engine->irq_enable_mask;
1710                 I915_WRITE(IMR, dev_priv->irq_mask);
1711                 POSTING_READ(IMR);
1712         }
1713         spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
1714 }
1715
1716 static bool
1717 i8xx_ring_get_irq(struct intel_engine_cs *engine)
1718 {
1719         struct drm_i915_private *dev_priv = engine->i915;
1720         unsigned long flags;
1721
1722         if (!intel_irqs_enabled(dev_priv))
1723                 return false;
1724
1725         spin_lock_irqsave(&dev_priv->irq_lock, flags);
1726         if (engine->irq_refcount++ == 0) {
1727                 dev_priv->irq_mask &= ~engine->irq_enable_mask;
1728                 I915_WRITE16(IMR, dev_priv->irq_mask);
1729                 POSTING_READ16(IMR);
1730         }
1731         spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
1732
1733         return true;
1734 }
1735
1736 static void
1737 i8xx_ring_put_irq(struct intel_engine_cs *engine)
1738 {
1739         struct drm_i915_private *dev_priv = engine->i915;
1740         unsigned long flags;
1741
1742         spin_lock_irqsave(&dev_priv->irq_lock, flags);
1743         if (--engine->irq_refcount == 0) {
1744                 dev_priv->irq_mask |= engine->irq_enable_mask;
1745                 I915_WRITE16(IMR, dev_priv->irq_mask);
1746                 POSTING_READ16(IMR);
1747         }
1748         spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
1749 }
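/*
 * Usage sketch for the irq get/put helpers above (assumed caller pattern,
 * e.g. from the request wait path): calls must be balanced, as only the
 * first get unmasks the engine's interrupt in IMR and only the last put
 * masks it again.
 *
 *   if (engine->irq_get(engine)) {
 *           ... wait for the seqno to be signalled ...
 *           engine->irq_put(engine);
 *   }
 */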
1750
1751 static int
1752 bsd_ring_flush(struct drm_i915_gem_request *req,
1753                u32     invalidate_domains,
1754                u32     flush_domains)
1755 {
1756         struct intel_engine_cs *engine = req->engine;
1757         int ret;
1758
1759         ret = intel_ring_begin(req, 2);
1760         if (ret)
1761                 return ret;
1762
1763         intel_ring_emit(engine, MI_FLUSH);
1764         intel_ring_emit(engine, MI_NOOP);
1765         intel_ring_advance(engine);
1766         return 0;
1767 }
1768
1769 static int
1770 i9xx_add_request(struct drm_i915_gem_request *req)
1771 {
1772         struct intel_engine_cs *engine = req->engine;
1773         int ret;
1774
1775         ret = intel_ring_begin(req, 4);
1776         if (ret)
1777                 return ret;
1778
1779         intel_ring_emit(engine, MI_STORE_DWORD_INDEX);
1780         intel_ring_emit(engine,
1781                         I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT);
1782         intel_ring_emit(engine, i915_gem_request_get_seqno(req));
1783         intel_ring_emit(engine, MI_USER_INTERRUPT);
1784         __intel_ring_advance(engine);
1785
1786         return 0;
1787 }
1788
1789 static bool
1790 gen6_ring_get_irq(struct intel_engine_cs *engine)
1791 {
1792         struct drm_i915_private *dev_priv = engine->i915;
1793         unsigned long flags;
1794
1795         if (WARN_ON(!intel_irqs_enabled(dev_priv)))
1796                 return false;
1797
1798         spin_lock_irqsave(&dev_priv->irq_lock, flags);
1799         if (engine->irq_refcount++ == 0) {
1800                 if (HAS_L3_DPF(dev_priv) && engine->id == RCS)
1801                         I915_WRITE_IMR(engine,
1802                                        ~(engine->irq_enable_mask |
1803                                          GT_PARITY_ERROR(dev_priv)));
1804                 else
1805                         I915_WRITE_IMR(engine, ~engine->irq_enable_mask);
1806                 gen5_enable_gt_irq(dev_priv, engine->irq_enable_mask);
1807         }
1808         spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
1809
1810         return true;
1811 }
1812
1813 static void
1814 gen6_ring_put_irq(struct intel_engine_cs *engine)
1815 {
1816         struct drm_i915_private *dev_priv = engine->i915;
1817         unsigned long flags;
1818
1819         spin_lock_irqsave(&dev_priv->irq_lock, flags);
1820         if (--engine->irq_refcount == 0) {
1821                 if (HAS_L3_DPF(dev_priv) && engine->id == RCS)
1822                         I915_WRITE_IMR(engine, ~GT_PARITY_ERROR(dev_priv));
1823                 else
1824                         I915_WRITE_IMR(engine, ~0);
1825                 gen5_disable_gt_irq(dev_priv, engine->irq_enable_mask);
1826         }
1827         spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
1828 }
1829
1830 static bool
1831 hsw_vebox_get_irq(struct intel_engine_cs *engine)
1832 {
1833         struct drm_i915_private *dev_priv = engine->i915;
1834         unsigned long flags;
1835
1836         if (WARN_ON(!intel_irqs_enabled(dev_priv)))
1837                 return false;
1838
1839         spin_lock_irqsave(&dev_priv->irq_lock, flags);
1840         if (engine->irq_refcount++ == 0) {
1841                 I915_WRITE_IMR(engine, ~engine->irq_enable_mask);
1842                 gen6_enable_pm_irq(dev_priv, engine->irq_enable_mask);
1843         }
1844         spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
1845
1846         return true;
1847 }
1848
1849 static void
1850 hsw_vebox_put_irq(struct intel_engine_cs *engine)
1851 {
1852         struct drm_i915_private *dev_priv = engine->i915;
1853         unsigned long flags;
1854
1855         spin_lock_irqsave(&dev_priv->irq_lock, flags);
1856         if (--engine->irq_refcount == 0) {
1857                 I915_WRITE_IMR(engine, ~0);
1858                 gen6_disable_pm_irq(dev_priv, engine->irq_enable_mask);
1859         }
1860         spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
1861 }
1862
1863 static bool
1864 gen8_ring_get_irq(struct intel_engine_cs *engine)
1865 {
1866         struct drm_i915_private *dev_priv = engine->i915;
1867         unsigned long flags;
1868
1869         if (WARN_ON(!intel_irqs_enabled(dev_priv)))
1870                 return false;
1871
1872         spin_lock_irqsave(&dev_priv->irq_lock, flags);
1873         if (engine->irq_refcount++ == 0) {
1874                 if (HAS_L3_DPF(dev_priv) && engine->id == RCS) {
1875                         I915_WRITE_IMR(engine,
1876                                        ~(engine->irq_enable_mask |
1877                                          GT_RENDER_L3_PARITY_ERROR_INTERRUPT));
1878                 } else {
1879                         I915_WRITE_IMR(engine, ~engine->irq_enable_mask);
1880                 }
1881                 POSTING_READ(RING_IMR(engine->mmio_base));
1882         }
1883         spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
1884
1885         return true;
1886 }
1887
1888 static void
1889 gen8_ring_put_irq(struct intel_engine_cs *engine)
1890 {
1891         struct drm_i915_private *dev_priv = engine->i915;
1892         unsigned long flags;
1893
1894         spin_lock_irqsave(&dev_priv->irq_lock, flags);
1895         if (--engine->irq_refcount == 0) {
1896                 if (HAS_L3_DPF(dev_priv) && engine->id == RCS) {
1897                         I915_WRITE_IMR(engine,
1898                                        ~GT_RENDER_L3_PARITY_ERROR_INTERRUPT);
1899                 } else {
1900                         I915_WRITE_IMR(engine, ~0);
1901                 }
1902                 POSTING_READ(RING_IMR(engine->mmio_base));
1903         }
1904         spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
1905 }
1906
1907 static int
1908 i965_dispatch_execbuffer(struct drm_i915_gem_request *req,
1909                          u64 offset, u32 length,
1910                          unsigned dispatch_flags)
1911 {
1912         struct intel_engine_cs *engine = req->engine;
1913         int ret;
1914
1915         ret = intel_ring_begin(req, 2);
1916         if (ret)
1917                 return ret;
1918
1919         intel_ring_emit(engine,
1920                         MI_BATCH_BUFFER_START |
1921                         MI_BATCH_GTT |
1922                         (dispatch_flags & I915_DISPATCH_SECURE ?
1923                          0 : MI_BATCH_NON_SECURE_I965));
1924         intel_ring_emit(engine, offset);
1925         intel_ring_advance(engine);
1926
1927         return 0;
1928 }
1929
1930 /* Just userspace ABI convention to limit the wa batch bo to a reasonable size */
1931 #define I830_BATCH_LIMIT (256*1024)
1932 #define I830_TLB_ENTRIES (2)
1933 #define I830_WA_SIZE max(I830_TLB_ENTRIES*4096, I830_BATCH_LIMIT)
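/*
 * With the values above, I830_WA_SIZE evaluates to
 * max(2 * 4096, 256 * 1024) = 256 KiB; that is the size of the scratch
 * workaround bo allocated for HAS_BROKEN_CS_TLB platforms in
 * intel_init_render_ring_buffer() below.
 */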
1934 static int
1935 i830_dispatch_execbuffer(struct drm_i915_gem_request *req,
1936                          u64 offset, u32 len,
1937                          unsigned dispatch_flags)
1938 {
1939         struct intel_engine_cs *engine = req->engine;
1940         u32 cs_offset = engine->scratch.gtt_offset;
1941         int ret;
1942
1943         ret = intel_ring_begin(req, 6);
1944         if (ret)
1945                 return ret;
1946
1947         /* Evict the invalid PTE TLBs */
1948         intel_ring_emit(engine, COLOR_BLT_CMD | BLT_WRITE_RGBA);
1949         intel_ring_emit(engine, BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096);
1950         intel_ring_emit(engine, I830_TLB_ENTRIES << 16 | 4); /* load each page */
1951         intel_ring_emit(engine, cs_offset);
1952         intel_ring_emit(engine, 0xdeadbeef);
1953         intel_ring_emit(engine, MI_NOOP);
1954         intel_ring_advance(engine);
1955
1956         if ((dispatch_flags & I915_DISPATCH_PINNED) == 0) {
1957                 if (len > I830_BATCH_LIMIT)
1958                         return -ENOSPC;
1959
1960                 ret = intel_ring_begin(req, 6 + 2);
1961                 if (ret)
1962                         return ret;
1963
1964                 /* Blit the batch (which has now all relocs applied) to the
1965                  * stable batch scratch bo area (so that the CS never
1966                  * stumbles over its tlb invalidation bug) ...
1967                  */
1968                 intel_ring_emit(engine, SRC_COPY_BLT_CMD | BLT_WRITE_RGBA);
1969                 intel_ring_emit(engine,
1970                                 BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096);
1971                 intel_ring_emit(engine, DIV_ROUND_UP(len, 4096) << 16 | 4096);
1972                 intel_ring_emit(engine, cs_offset);
1973                 intel_ring_emit(engine, 4096);
1974                 intel_ring_emit(engine, offset);
1975
1976                 intel_ring_emit(engine, MI_FLUSH);
1977                 intel_ring_emit(engine, MI_NOOP);
1978                 intel_ring_advance(engine);
1979
1980                 /* ... and execute it. */
1981                 offset = cs_offset;
1982         }
1983
1984         ret = intel_ring_begin(req, 2);
1985         if (ret)
1986                 return ret;
1987
1988         intel_ring_emit(engine, MI_BATCH_BUFFER_START | MI_BATCH_GTT);
1989         intel_ring_emit(engine, offset | (dispatch_flags & I915_DISPATCH_SECURE ?
1990                                           0 : MI_BATCH_NON_SECURE));
1991         intel_ring_advance(engine);
1992
1993         return 0;
1994 }
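/*
 * Dword accounting for the i830 dispatch above, as a worked example:
 * 6 dwords evict the stale TLB entries, an unpinned batch costs another
 * 6 + 2 dwords to blit it into the scratch area, and 2 dwords start the
 * (possibly redirected) batch.  The SRC_COPY blit copies the batch in
 * 4096-byte rows, so an unpinned 8 KiB batch uses
 * DIV_ROUND_UP(8192, 4096) = 2 rows; anything over I830_BATCH_LIMIT is
 * rejected with -ENOSPC unless I915_DISPATCH_PINNED is set.
 */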
1995
1996 static int
1997 i915_dispatch_execbuffer(struct drm_i915_gem_request *req,
1998                          u64 offset, u32 len,
1999                          unsigned dispatch_flags)
2000 {
2001         struct intel_engine_cs *engine = req->engine;
2002         int ret;
2003
2004         ret = intel_ring_begin(req, 2);
2005         if (ret)
2006                 return ret;
2007
2008         intel_ring_emit(engine, MI_BATCH_BUFFER_START | MI_BATCH_GTT);
2009         intel_ring_emit(engine, offset | (dispatch_flags & I915_DISPATCH_SECURE ?
2010                                           0 : MI_BATCH_NON_SECURE));
2011         intel_ring_advance(engine);
2012
2013         return 0;
2014 }
2015
2016 static void cleanup_phys_status_page(struct intel_engine_cs *engine)
2017 {
2018         struct drm_i915_private *dev_priv = engine->i915;
2019
2020         if (!dev_priv->status_page_dmah)
2021                 return;
2022
2023         drm_pci_free(dev_priv->dev, dev_priv->status_page_dmah);
2024         engine->status_page.page_addr = NULL;
2025 }
2026
2027 static void cleanup_status_page(struct intel_engine_cs *engine)
2028 {
2029         struct drm_i915_gem_object *obj;
2030
2031         obj = engine->status_page.obj;
2032         if (obj == NULL)
2033                 return;
2034
2035         kunmap(sg_page(obj->pages->sgl));
2036         i915_gem_object_ggtt_unpin(obj);
2037         drm_gem_object_unreference(&obj->base);
2038         engine->status_page.obj = NULL;
2039 }
2040
2041 static int init_status_page(struct intel_engine_cs *engine)
2042 {
2043         struct drm_i915_gem_object *obj = engine->status_page.obj;
2044
2045         if (obj == NULL) {
2046                 unsigned flags;
2047                 int ret;
2048
2049                 obj = i915_gem_object_create(engine->i915->dev, 4096);
2050                 if (IS_ERR(obj)) {
2051                         DRM_ERROR("Failed to allocate status page\n");
2052                         return PTR_ERR(obj);
2053                 }
2054
2055                 ret = i915_gem_object_set_cache_level(obj, I915_CACHE_LLC);
2056                 if (ret)
2057                         goto err_unref;
2058
2059                 flags = 0;
2060                 if (!HAS_LLC(engine->i915))
2061                         /* On g33, we cannot place HWS above 256MiB, so
2062                          * restrict its pinning to the low mappable arena.
2063                          * Though this restriction is not documented for
2064                          * gen4, gen5, or byt, they also behave similarly
2065                          * and hang if the HWS is placed at the top of the
2066                          * GTT. To generalise, it appears that all !llc
2067                          * platforms have issues with us placing the HWS
2068                          * above the mappable region (even though we never
2069                          * actually map it).
2070                          */
2071                         flags |= PIN_MAPPABLE;
2072                 ret = i915_gem_obj_ggtt_pin(obj, 4096, flags);
2073                 if (ret) {
2074 err_unref:
2075                         drm_gem_object_unreference(&obj->base);
2076                         return ret;
2077                 }
2078
2079                 engine->status_page.obj = obj;
2080         }
2081
2082         engine->status_page.gfx_addr = i915_gem_obj_ggtt_offset(obj);
2083         engine->status_page.page_addr = kmap(sg_page(obj->pages->sgl));
2084         memset(engine->status_page.page_addr, 0, PAGE_SIZE);
2085
2086         DRM_DEBUG_DRIVER("%s hws offset: 0x%08x\n",
2087                         engine->name, engine->status_page.gfx_addr);
2088
2089         return 0;
2090 }
2091
2092 static int init_phys_status_page(struct intel_engine_cs *engine)
2093 {
2094         struct drm_i915_private *dev_priv = engine->i915;
2095
2096         if (!dev_priv->status_page_dmah) {
2097                 dev_priv->status_page_dmah =
2098                         drm_pci_alloc(dev_priv->dev, PAGE_SIZE, PAGE_SIZE);
2099                 if (!dev_priv->status_page_dmah)
2100                         return -ENOMEM;
2101         }
2102
2103         engine->status_page.page_addr = dev_priv->status_page_dmah->vaddr;
2104         memset(engine->status_page.page_addr, 0, PAGE_SIZE);
2105
2106         return 0;
2107 }
2108
2109 void intel_unpin_ringbuffer_obj(struct intel_ringbuffer *ringbuf)
2110 {
2111         GEM_BUG_ON(ringbuf->vma == NULL);
2112         GEM_BUG_ON(ringbuf->virtual_start == NULL);
2113
2114         if (HAS_LLC(ringbuf->obj->base.dev) && !ringbuf->obj->stolen)
2115                 i915_gem_object_unpin_map(ringbuf->obj);
2116         else
2117                 i915_vma_unpin_iomap(ringbuf->vma);
2118         ringbuf->virtual_start = NULL;
2119
2120         i915_gem_object_ggtt_unpin(ringbuf->obj);
2121         ringbuf->vma = NULL;
2122 }
2123
2124 int intel_pin_and_map_ringbuffer_obj(struct drm_i915_private *dev_priv,
2125                                      struct intel_ringbuffer *ringbuf)
2126 {
2127         struct drm_i915_gem_object *obj = ringbuf->obj;
2128         /* Ring wraparound at offset 0 sometimes hangs. No idea why. */
2129         unsigned flags = PIN_OFFSET_BIAS | 4096;
2130         void *addr;
2131         int ret;
2132
2133         if (HAS_LLC(dev_priv) && !obj->stolen) {
2134                 ret = i915_gem_obj_ggtt_pin(obj, PAGE_SIZE, flags);
2135                 if (ret)
2136                         return ret;
2137
2138                 ret = i915_gem_object_set_to_cpu_domain(obj, true);
2139                 if (ret)
2140                         goto err_unpin;
2141
2142                 addr = i915_gem_object_pin_map(obj);
2143                 if (IS_ERR(addr)) {
2144                         ret = PTR_ERR(addr);
2145                         goto err_unpin;
2146                 }
2147         } else {
2148                 ret = i915_gem_obj_ggtt_pin(obj, PAGE_SIZE,
2149                                             flags | PIN_MAPPABLE);
2150                 if (ret)
2151                         return ret;
2152
2153                 ret = i915_gem_object_set_to_gtt_domain(obj, true);
2154                 if (ret)
2155                         goto err_unpin;
2156
2157                 /* Access through the GTT requires the device to be awake. */
2158                 assert_rpm_wakelock_held(dev_priv);
2159
2160                 addr = i915_vma_pin_iomap(i915_gem_obj_to_ggtt(obj));
2161                 if (IS_ERR(addr)) {
2162                         ret = PTR_ERR(addr);
2163                         goto err_unpin;
2164                 }
2165         }
2166
2167         ringbuf->virtual_start = addr;
2168         ringbuf->vma = i915_gem_obj_to_ggtt(obj);
2169         return 0;
2170
2171 err_unpin:
2172         i915_gem_object_ggtt_unpin(obj);
2173         return ret;
2174 }
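/*
 * Usage sketch (mirroring intel_init_ring_buffer() further down):
 *
 *   ret = intel_pin_and_map_ringbuffer_obj(dev_priv, ringbuf);
 *   if (ret)
 *           goto error;
 *   ...
 *   intel_unpin_ringbuffer_obj(ringbuf);   (teardown counterpart)
 *
 * On LLC platforms the ring is mapped through the CPU cache, otherwise
 * through a GTT iomap, and the unpin path above undoes whichever mapping
 * was taken.
 */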
2175
2176 static void intel_destroy_ringbuffer_obj(struct intel_ringbuffer *ringbuf)
2177 {
2178         drm_gem_object_unreference(&ringbuf->obj->base);
2179         ringbuf->obj = NULL;
2180 }
2181
2182 static int intel_alloc_ringbuffer_obj(struct drm_device *dev,
2183                                       struct intel_ringbuffer *ringbuf)
2184 {
2185         struct drm_i915_gem_object *obj;
2186
2187         obj = NULL;
2188         if (!HAS_LLC(dev))
2189                 obj = i915_gem_object_create_stolen(dev, ringbuf->size);
2190         if (obj == NULL)
2191                 obj = i915_gem_object_create(dev, ringbuf->size);
2192         if (IS_ERR(obj))
2193                 return PTR_ERR(obj);
2194
2195         /* mark ring buffers as read-only from GPU side by default */
2196         obj->gt_ro = 1;
2197
2198         ringbuf->obj = obj;
2199
2200         return 0;
2201 }
2202
2203 struct intel_ringbuffer *
2204 intel_engine_create_ringbuffer(struct intel_engine_cs *engine, int size)
2205 {
2206         struct intel_ringbuffer *ring;
2207         int ret;
2208
2209         ring = kzalloc(sizeof(*ring), GFP_KERNEL);
2210         if (ring == NULL) {
2211                 DRM_DEBUG_DRIVER("Failed to allocate ringbuffer %s\n",
2212                                  engine->name);
2213                 return ERR_PTR(-ENOMEM);
2214         }
2215
2216         ring->engine = engine;
2217         list_add(&ring->link, &engine->buffers);
2218
2219         ring->size = size;
2220         /* Workaround an erratum on the i830 which causes a hang if
2221          * the TAIL pointer points to within the last 2 cachelines
2222          * of the buffer.
2223          */
2224         ring->effective_size = size;
2225         if (IS_I830(engine->i915) || IS_845G(engine->i915))
2226                 ring->effective_size -= 2 * CACHELINE_BYTES;
2227
2228         ring->last_retired_head = -1;
2229         intel_ring_update_space(ring);
2230
2231         ret = intel_alloc_ringbuffer_obj(engine->i915->dev, ring);
2232         if (ret) {
2233                 DRM_DEBUG_DRIVER("Failed to allocate ringbuffer %s: %d\n",
2234                                  engine->name, ret);
2235                 list_del(&ring->link);
2236                 kfree(ring);
2237                 return ERR_PTR(ret);
2238         }
2239
2240         return ring;
2241 }
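/*
 * Illustrative numbers and usage for the helper above: the legacy rings
 * below are created with 32 * PAGE_SIZE (128 KiB with 4 KiB pages), and
 * on i830/845G the effective size drops by 2 * CACHELINE_BYTES to keep
 * the TAIL pointer away from the last two cachelines.  A caller pairs it
 * with intel_ringbuffer_free():
 *
 *   ring = intel_engine_create_ringbuffer(engine, 32 * PAGE_SIZE);
 *   if (IS_ERR(ring))
 *           return PTR_ERR(ring);
 *   ...
 *   intel_ringbuffer_free(ring);
 */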
2242
2243 void
2244 intel_ringbuffer_free(struct intel_ringbuffer *ring)
2245 {
2246         intel_destroy_ringbuffer_obj(ring);
2247         list_del(&ring->link);
2248         kfree(ring);
2249 }
2250
2251 static int intel_init_ring_buffer(struct drm_device *dev,
2252                                   struct intel_engine_cs *engine)
2253 {
2254         struct drm_i915_private *dev_priv = to_i915(dev);
2255         struct intel_ringbuffer *ringbuf;
2256         int ret;
2257
2258         WARN_ON(engine->buffer);
2259
2260         engine->i915 = dev_priv;
2261         INIT_LIST_HEAD(&engine->active_list);
2262         INIT_LIST_HEAD(&engine->request_list);
2263         INIT_LIST_HEAD(&engine->execlist_queue);
2264         INIT_LIST_HEAD(&engine->buffers);
2265         i915_gem_batch_pool_init(dev, &engine->batch_pool);
2266         memset(engine->semaphore.sync_seqno, 0,
2267                sizeof(engine->semaphore.sync_seqno));
2268
2269         init_waitqueue_head(&engine->irq_queue);
2270
2271         ringbuf = intel_engine_create_ringbuffer(engine, 32 * PAGE_SIZE);
2272         if (IS_ERR(ringbuf)) {
2273                 ret = PTR_ERR(ringbuf);
2274                 goto error;
2275         }
2276         engine->buffer = ringbuf;
2277
2278         if (I915_NEED_GFX_HWS(dev_priv)) {
2279                 ret = init_status_page(engine);
2280                 if (ret)
2281                         goto error;
2282         } else {
2283                 WARN_ON(engine->id != RCS);
2284                 ret = init_phys_status_page(engine);
2285                 if (ret)
2286                         goto error;
2287         }
2288
2289         ret = intel_pin_and_map_ringbuffer_obj(dev_priv, ringbuf);
2290         if (ret) {
2291                 DRM_ERROR("Failed to pin and map ringbuffer %s: %d\n",
2292                                 engine->name, ret);
2293                 intel_destroy_ringbuffer_obj(ringbuf);
2294                 goto error;
2295         }
2296
2297         ret = i915_cmd_parser_init_ring(engine);
2298         if (ret)
2299                 goto error;
2300
2301         return 0;
2302
2303 error:
2304         intel_cleanup_engine(engine);
2305         return ret;
2306 }
2307
2308 void intel_cleanup_engine(struct intel_engine_cs *engine)
2309 {
2310         struct drm_i915_private *dev_priv;
2311
2312         if (!intel_engine_initialized(engine))
2313                 return;
2314
2315         dev_priv = engine->i915;
2316
2317         if (engine->buffer) {
2318                 intel_stop_engine(engine);
2319                 WARN_ON(!IS_GEN2(dev_priv) && (I915_READ_MODE(engine) & MODE_IDLE) == 0);
2320
2321                 intel_unpin_ringbuffer_obj(engine->buffer);
2322                 intel_ringbuffer_free(engine->buffer);
2323                 engine->buffer = NULL;
2324         }
2325
2326         if (engine->cleanup)
2327                 engine->cleanup(engine);
2328
2329         if (I915_NEED_GFX_HWS(dev_priv)) {
2330                 cleanup_status_page(engine);
2331         } else {
2332                 WARN_ON(engine->id != RCS);
2333                 cleanup_phys_status_page(engine);
2334         }
2335
2336         i915_cmd_parser_fini_ring(engine);
2337         i915_gem_batch_pool_fini(&engine->batch_pool);
2338         engine->i915 = NULL;
2339 }
2340
2341 int intel_engine_idle(struct intel_engine_cs *engine)
2342 {
2343         struct drm_i915_gem_request *req;
2344
2345         /* Wait upon the last request to be completed */
2346         if (list_empty(&engine->request_list))
2347                 return 0;
2348
2349         req = list_entry(engine->request_list.prev,
2350                          struct drm_i915_gem_request,
2351                          list);
2352
2353         /* Make sure we do not trigger any retires */
2354         return __i915_wait_request(req,
2355                                    req->i915->mm.interruptible,
2356                                    NULL, NULL);
2357 }
2358
2359 int intel_ring_alloc_request_extras(struct drm_i915_gem_request *request)
2360 {
2361         int ret;
2362
2363         /* Flush enough space to reduce the likelihood of waiting after
2364          * we start building the request - in which case we will just
2365          * have to repeat work.
2366          */
2367         request->reserved_space += LEGACY_REQUEST_SIZE;
2368
2369         request->ringbuf = request->engine->buffer;
2370
2371         ret = intel_ring_begin(request, 0);
2372         if (ret)
2373                 return ret;
2374
2375         request->reserved_space -= LEGACY_REQUEST_SIZE;
2376         return 0;
2377 }
2378
2379 static int wait_for_space(struct drm_i915_gem_request *req, int bytes)
2380 {
2381         struct intel_ringbuffer *ringbuf = req->ringbuf;
2382         struct intel_engine_cs *engine = req->engine;
2383         struct drm_i915_gem_request *target;
2384
2385         intel_ring_update_space(ringbuf);
2386         if (ringbuf->space >= bytes)
2387                 return 0;
2388
2389         /*
2390          * Space is reserved in the ringbuffer for finalising the request,
2391          * as that cannot be allowed to fail. During request finalisation,
2392          * reserved_space is set to 0 to stop the overallocation and the
2393          * assumption is that then we never need to wait (which has the
2394          * risk of failing with EINTR).
2395          *
2396          * See also i915_gem_request_alloc() and i915_add_request().
2397          */
2398         GEM_BUG_ON(!req->reserved_space);
2399
2400         list_for_each_entry(target, &engine->request_list, list) {
2401                 unsigned space;
2402
2403                 /*
2404                  * The request queue is per-engine, so can contain requests
2405                  * from multiple ringbuffers. Here, we must ignore any that
2406                  * aren't from the ringbuffer we're considering.
2407                  */
2408                 if (target->ringbuf != ringbuf)
2409                         continue;
2410
2411                 /* Would completion of this request free enough space? */
2412                 space = __intel_ring_space(target->postfix, ringbuf->tail,
2413                                            ringbuf->size);
2414                 if (space >= bytes)
2415                         break;
2416         }
2417
2418         if (WARN_ON(&target->list == &engine->request_list))
2419                 return -ENOSPC;
2420
2421         return i915_wait_request(target);
2422 }
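/*
 * Sketch of the wait_for_space() walk above: requests are inspected
 * oldest first, and for each one on this ringbuffer the space that its
 * retirement would free is the circular distance from its postfix back
 * to the current tail.  For example, with a 128 KiB ring, tail at 120 KiB
 * and a candidate postfix at 16 KiB, completion would free roughly
 * 24 KiB; the first request that frees at least `bytes` is the one
 * waited upon.
 */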
2423
2424 int intel_ring_begin(struct drm_i915_gem_request *req, int num_dwords)
2425 {
2426         struct intel_ringbuffer *ringbuf = req->ringbuf;
2427         int remain_actual = ringbuf->size - ringbuf->tail;
2428         int remain_usable = ringbuf->effective_size - ringbuf->tail;
2429         int bytes = num_dwords * sizeof(u32);
2430         int total_bytes, wait_bytes;
2431         bool need_wrap = false;
2432
2433         total_bytes = bytes + req->reserved_space;
2434
2435         if (unlikely(bytes > remain_usable)) {
2436                 /*
2437                  * Not enough space for the basic request. So need to flush
2438                  * out the remainder and then wait for base + reserved.
2439                  */
2440                 wait_bytes = remain_actual + total_bytes;
2441                 need_wrap = true;
2442         } else if (unlikely(total_bytes > remain_usable)) {
2443                 /*
2444                  * The base request will fit but the reserved space
2445                  * falls off the end. So we don't need an immediate wrap
2446                  * and only need to effectively wait for the reserved
2447                  * size space from the start of ringbuffer.
2448                  */
2449                 wait_bytes = remain_actual + req->reserved_space;
2450         } else {
2451                 /* No wrapping required, just waiting. */
2452                 wait_bytes = total_bytes;
2453         }
2454
2455         if (wait_bytes > ringbuf->space) {
2456                 int ret = wait_for_space(req, wait_bytes);
2457                 if (unlikely(ret))
2458                         return ret;
2459
2460                 intel_ring_update_space(ringbuf);
2461                 if (unlikely(ringbuf->space < wait_bytes))
2462                         return -EAGAIN;
2463         }
2464
2465         if (unlikely(need_wrap)) {
2466                 GEM_BUG_ON(remain_actual > ringbuf->space);
2467                 GEM_BUG_ON(ringbuf->tail + remain_actual > ringbuf->size);
2468
2469                 /* Fill the tail with MI_NOOP */
2470                 memset(ringbuf->virtual_start + ringbuf->tail,
2471                        0, remain_actual);
2472                 ringbuf->tail = 0;
2473                 ringbuf->space -= remain_actual;
2474         }
2475
2476         ringbuf->space -= bytes;
2477         GEM_BUG_ON(ringbuf->space < 0);
2478         return 0;
2479 }
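/*
 * Worked example of the wrap handling above (hypothetical numbers): with
 * a 128 KiB ring (effective_size == size on non-i830), tail = 0x1ffe0
 * leaves remain_usable = remain_actual = 0x20 bytes.  A 16-dword request
 * needs 0x40 bytes, so need_wrap is set and wait_bytes becomes
 * 0x20 + 0x40 + reserved_space; once enough space is free, the 0x20-byte
 * remainder is zero-filled (MI_NOOP) and tail restarts at 0.
 */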
2480
2481 /* Align the ring tail to a cacheline boundary */
2482 int intel_ring_cacheline_align(struct drm_i915_gem_request *req)
2483 {
2484         struct intel_engine_cs *engine = req->engine;
2485         int num_dwords = (engine->buffer->tail & (CACHELINE_BYTES - 1)) / sizeof(uint32_t);
2486         int ret;
2487
2488         if (num_dwords == 0)
2489                 return 0;
2490
2491         num_dwords = CACHELINE_BYTES / sizeof(uint32_t) - num_dwords;
2492         ret = intel_ring_begin(req, num_dwords);
2493         if (ret)
2494                 return ret;
2495
2496         while (num_dwords--)
2497                 intel_ring_emit(engine, MI_NOOP);
2498
2499         intel_ring_advance(engine);
2500
2501         return 0;
2502 }
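/*
 * Example for the alignment math above, assuming the usual 64-byte
 * CACHELINE_BYTES: a tail at byte offset 0x...44 is (0x44 & 63) / 4 = 1
 * dword into its cacheline, so 64 / 4 - 1 = 15 MI_NOOPs are emitted to
 * reach the next cacheline boundary; an already-aligned tail emits
 * nothing.
 */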
2503
2504 void intel_ring_init_seqno(struct intel_engine_cs *engine, u32 seqno)
2505 {
2506         struct drm_i915_private *dev_priv = engine->i915;
2507
2508         /* Our semaphore implementation is strictly monotonic (i.e. we proceed
2509          * so long as the semaphore value in the register/page is greater
2510          * than the sync value), so whenever we reset the seqno,
2511          * so long as we reset the tracking semaphore value to 0, it will
2512          * always be before the next request's seqno. If we don't reset
2513          * the semaphore value, then when the seqno moves backwards all
2514          * future waits will complete instantly (causing rendering corruption).
2515          */
2516         if (IS_GEN6(dev_priv) || IS_GEN7(dev_priv)) {
2517                 I915_WRITE(RING_SYNC_0(engine->mmio_base), 0);
2518                 I915_WRITE(RING_SYNC_1(engine->mmio_base), 0);
2519                 if (HAS_VEBOX(dev_priv))
2520                         I915_WRITE(RING_SYNC_2(engine->mmio_base), 0);
2521         }
2522         if (dev_priv->semaphore_obj) {
2523                 struct drm_i915_gem_object *obj = dev_priv->semaphore_obj;
2524                 struct page *page = i915_gem_object_get_dirty_page(obj, 0);
2525                 void *semaphores = kmap(page);
2526                 memset(semaphores + GEN8_SEMAPHORE_OFFSET(engine->id, 0),
2527                        0, I915_NUM_ENGINES * gen8_semaphore_seqno_size);
2528                 kunmap(page);
2529         }
2530         memset(engine->semaphore.sync_seqno, 0,
2531                sizeof(engine->semaphore.sync_seqno));
2532
2533         engine->set_seqno(engine, seqno);
2534         engine->last_submitted_seqno = seqno;
2535
2536         engine->hangcheck.seqno = seqno;
2537 }
2538
2539 static void gen6_bsd_ring_write_tail(struct intel_engine_cs *engine,
2540                                      u32 value)
2541 {
2542         struct drm_i915_private *dev_priv = engine->i915;
2543
2544         /* Every tail move must follow the sequence below */
2545
2546         /* Disable notification that the ring is IDLE. The GT
2547          * will then assume that it is busy and bring it out of rc6.
2548          */
2549         I915_WRITE(GEN6_BSD_SLEEP_PSMI_CONTROL,
2550                    _MASKED_BIT_ENABLE(GEN6_BSD_SLEEP_MSG_DISABLE));
2551
2552         /* Clear the context id. Here be magic! */
2553         I915_WRITE64(GEN6_BSD_RNCID, 0x0);
2554
2555         /* Wait for the ring not to be idle, i.e. for it to wake up. */
2556         if (wait_for((I915_READ(GEN6_BSD_SLEEP_PSMI_CONTROL) &
2557                       GEN6_BSD_SLEEP_INDICATOR) == 0,
2558                      50))
2559                 DRM_ERROR("timed out waiting for the BSD ring to wake up\n");
2560
2561         /* Now that the ring is fully powered up, update the tail */
2562         I915_WRITE_TAIL(engine, value);
2563         POSTING_READ(RING_TAIL(engine->mmio_base));
2564
2565         /* Let the ring send IDLE messages to the GT again,
2566          * and so let it sleep to conserve power when idle.
2567          */
2568         I915_WRITE(GEN6_BSD_SLEEP_PSMI_CONTROL,
2569                    _MASKED_BIT_DISABLE(GEN6_BSD_SLEEP_MSG_DISABLE));
2570 }
2571
2572 static int gen6_bsd_ring_flush(struct drm_i915_gem_request *req,
2573                                u32 invalidate, u32 flush)
2574 {
2575         struct intel_engine_cs *engine = req->engine;
2576         uint32_t cmd;
2577         int ret;
2578
2579         ret = intel_ring_begin(req, 4);
2580         if (ret)
2581                 return ret;
2582
2583         cmd = MI_FLUSH_DW;
2584         if (INTEL_GEN(req->i915) >= 8)
2585                 cmd += 1;
2586
2587         /* We always require a command barrier so that subsequent
2588          * commands, such as breadcrumb interrupts, are strictly ordered
2589          * wrt the contents of the write cache being flushed to memory
2590          * (and thus being coherent from the CPU).
2591          */
2592         cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
2593
2594         /*
2595          * Bspec vol 1c.5 - video engine command streamer:
2596          * "If ENABLED, all TLBs will be invalidated once the flush
2597          * operation is complete. This bit is only valid when the
2598          * Post-Sync Operation field is a value of 1h or 3h."
2599          */
2600         if (invalidate & I915_GEM_GPU_DOMAINS)
2601                 cmd |= MI_INVALIDATE_TLB | MI_INVALIDATE_BSD;
2602
2603         intel_ring_emit(engine, cmd);
2604         intel_ring_emit(engine,
2605                         I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT);
2606         if (INTEL_GEN(req->i915) >= 8) {
2607                 intel_ring_emit(engine, 0); /* upper addr */
2608                 intel_ring_emit(engine, 0); /* value */
2609         } else  {
2610                 intel_ring_emit(engine, 0);
2611                 intel_ring_emit(engine, MI_NOOP);
2612         }
2613         intel_ring_advance(engine);
2614         return 0;
2615 }
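/*
 * Layout of the four dwords emitted above, as a sketch: the MI_FLUSH_DW
 * command (lengthened by one on gen8 for the wider address), the HWS
 * scratch address | MI_FLUSH_DW_USE_GTT, and then either upper-address
 * and value dwords (gen8) or a zero and an MI_NOOP pad (gen6/7).  The
 * same shape is reused by the blitter flush in gen6_ring_flush() below.
 */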
2616
2617 static int
2618 gen8_ring_dispatch_execbuffer(struct drm_i915_gem_request *req,
2619                               u64 offset, u32 len,
2620                               unsigned dispatch_flags)
2621 {
2622         struct intel_engine_cs *engine = req->engine;
2623         bool ppgtt = USES_PPGTT(engine->dev) &&
2624                         !(dispatch_flags & I915_DISPATCH_SECURE);
2625         int ret;
2626
2627         ret = intel_ring_begin(req, 4);
2628         if (ret)
2629                 return ret;
2630
2631         /* FIXME(BDW): Address space and security selectors. */
2632         intel_ring_emit(engine, MI_BATCH_BUFFER_START_GEN8 | (ppgtt<<8) |
2633                         (dispatch_flags & I915_DISPATCH_RS ?
2634                          MI_BATCH_RESOURCE_STREAMER : 0));
2635         intel_ring_emit(engine, lower_32_bits(offset));
2636         intel_ring_emit(engine, upper_32_bits(offset));
2637         intel_ring_emit(engine, MI_NOOP);
2638         intel_ring_advance(engine);
2639
2640         return 0;
2641 }
2642
2643 static int
2644 hsw_ring_dispatch_execbuffer(struct drm_i915_gem_request *req,
2645                              u64 offset, u32 len,
2646                              unsigned dispatch_flags)
2647 {
2648         struct intel_engine_cs *engine = req->engine;
2649         int ret;
2650
2651         ret = intel_ring_begin(req, 2);
2652         if (ret)
2653                 return ret;
2654
2655         intel_ring_emit(engine,
2656                         MI_BATCH_BUFFER_START |
2657                         (dispatch_flags & I915_DISPATCH_SECURE ?
2658                          0 : MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW) |
2659                         (dispatch_flags & I915_DISPATCH_RS ?
2660                          MI_BATCH_RESOURCE_STREAMER : 0));
2661         /* bits 0-7 are the length on GEN6+ */
2662         intel_ring_emit(engine, offset);
2663         intel_ring_advance(engine);
2664
2665         return 0;
2666 }
2667
2668 static int
2669 gen6_ring_dispatch_execbuffer(struct drm_i915_gem_request *req,
2670                               u64 offset, u32 len,
2671                               unsigned dispatch_flags)
2672 {
2673         struct intel_engine_cs *engine = req->engine;
2674         int ret;
2675
2676         ret = intel_ring_begin(req, 2);
2677         if (ret)
2678                 return ret;
2679
2680         intel_ring_emit(engine,
2681                         MI_BATCH_BUFFER_START |
2682                         (dispatch_flags & I915_DISPATCH_SECURE ?
2683                          0 : MI_BATCH_NON_SECURE_I965));
2684         /* bits 0-7 are the length on GEN6+ */
2685         intel_ring_emit(engine, offset);
2686         intel_ring_advance(engine);
2687
2688         return 0;
2689 }
2690
2691 /* Blitter support (SandyBridge+) */
2692
2693 static int gen6_ring_flush(struct drm_i915_gem_request *req,
2694                            u32 invalidate, u32 flush)
2695 {
2696         struct intel_engine_cs *engine = req->engine;
2697         uint32_t cmd;
2698         int ret;
2699
2700         ret = intel_ring_begin(req, 4);
2701         if (ret)
2702                 return ret;
2703
2704         cmd = MI_FLUSH_DW;
2705         if (INTEL_GEN(req->i915) >= 8)
2706                 cmd += 1;
2707
2708         /* We always require a command barrier so that subsequent
2709          * commands, such as breadcrumb interrupts, are strictly ordered
2710          * wrt the contents of the write cache being flushed to memory
2711          * (and thus being coherent from the CPU).
2712          */
2713         cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
2714
2715         /*
2716          * Bspec vol 1c.3 - blitter engine command streamer:
2717          * "If ENABLED, all TLBs will be invalidated once the flush
2718          * operation is complete. This bit is only valid when the
2719          * Post-Sync Operation field is a value of 1h or 3h."
2720          */
2721         if (invalidate & I915_GEM_DOMAIN_RENDER)
2722                 cmd |= MI_INVALIDATE_TLB;
2723         intel_ring_emit(engine, cmd);
2724         intel_ring_emit(engine,
2725                         I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT);
2726         if (INTEL_GEN(req->i915) >= 8) {
2727                 intel_ring_emit(engine, 0); /* upper addr */
2728                 intel_ring_emit(engine, 0); /* value */
2729         } else  {
2730                 intel_ring_emit(engine, 0);
2731                 intel_ring_emit(engine, MI_NOOP);
2732         }
2733         intel_ring_advance(engine);
2734
2735         return 0;
2736 }
2737
2738 int intel_init_render_ring_buffer(struct drm_device *dev)
2739 {
2740         struct drm_i915_private *dev_priv = dev->dev_private;
2741         struct intel_engine_cs *engine = &dev_priv->engine[RCS];
2742         struct drm_i915_gem_object *obj;
2743         int ret;
2744
2745         engine->name = "render ring";
2746         engine->id = RCS;
2747         engine->exec_id = I915_EXEC_RENDER;
2748         engine->hw_id = 0;
2749         engine->mmio_base = RENDER_RING_BASE;
2750
2751         if (INTEL_GEN(dev_priv) >= 8) {
2752                 if (i915_semaphore_is_enabled(dev_priv)) {
2753                         obj = i915_gem_object_create(dev, 4096);
2754                         if (IS_ERR(obj)) {
2755                                 DRM_ERROR("Failed to allocate semaphore bo. Disabling semaphores\n");
2756                                 i915.semaphores = 0;
2757                         } else {
2758                                 i915_gem_object_set_cache_level(obj, I915_CACHE_LLC);
2759                                 ret = i915_gem_obj_ggtt_pin(obj, 0, PIN_NONBLOCK);
2760                                 if (ret != 0) {
2761                                         drm_gem_object_unreference(&obj->base);
2762                                         DRM_ERROR("Failed to pin semaphore bo. Disabling semaphores\n");
2763                                         i915.semaphores = 0;
2764                                 } else
2765                                         dev_priv->semaphore_obj = obj;
2766                         }
2767                 }
2768
2769                 engine->init_context = intel_rcs_ctx_init;
2770                 engine->add_request = gen8_render_add_request;
2771                 engine->flush = gen8_render_ring_flush;
2772                 engine->irq_get = gen8_ring_get_irq;
2773                 engine->irq_put = gen8_ring_put_irq;
2774                 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT;
2775                 engine->get_seqno = ring_get_seqno;
2776                 engine->set_seqno = ring_set_seqno;
2777                 if (i915_semaphore_is_enabled(dev_priv)) {
2778                         WARN_ON(!dev_priv->semaphore_obj);
2779                         engine->semaphore.sync_to = gen8_ring_sync;
2780                         engine->semaphore.signal = gen8_rcs_signal;
2781                         GEN8_RING_SEMAPHORE_INIT(engine);
2782                 }
2783         } else if (INTEL_GEN(dev_priv) >= 6) {
2784                 engine->init_context = intel_rcs_ctx_init;
2785                 engine->add_request = gen6_add_request;
2786                 engine->flush = gen7_render_ring_flush;
2787                 if (IS_GEN6(dev_priv))
2788                         engine->flush = gen6_render_ring_flush;
2789                 engine->irq_get = gen6_ring_get_irq;
2790                 engine->irq_put = gen6_ring_put_irq;
2791                 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT;
2792                 engine->irq_seqno_barrier = gen6_seqno_barrier;
2793                 engine->get_seqno = ring_get_seqno;
2794                 engine->set_seqno = ring_set_seqno;
2795                 if (i915_semaphore_is_enabled(dev_priv)) {
2796                         engine->semaphore.sync_to = gen6_ring_sync;
2797                         engine->semaphore.signal = gen6_signal;
2798                         /*
2799                          * The current semaphore is only applied on pre-gen8
2800                          * platform.  And there is no VCS2 ring on the pre-gen8
2801                          * platform. So the semaphore between RCS and VCS2 is
2802                          * initialized as INVALID.  Gen8 will initialize the
2803                          * sema between VCS2 and RCS later.
2804                          */
2805                         engine->semaphore.mbox.wait[RCS] = MI_SEMAPHORE_SYNC_INVALID;
2806                         engine->semaphore.mbox.wait[VCS] = MI_SEMAPHORE_SYNC_RV;
2807                         engine->semaphore.mbox.wait[BCS] = MI_SEMAPHORE_SYNC_RB;
2808                         engine->semaphore.mbox.wait[VECS] = MI_SEMAPHORE_SYNC_RVE;
2809                         engine->semaphore.mbox.wait[VCS2] = MI_SEMAPHORE_SYNC_INVALID;
2810                         engine->semaphore.mbox.signal[RCS] = GEN6_NOSYNC;
2811                         engine->semaphore.mbox.signal[VCS] = GEN6_VRSYNC;
2812                         engine->semaphore.mbox.signal[BCS] = GEN6_BRSYNC;
2813                         engine->semaphore.mbox.signal[VECS] = GEN6_VERSYNC;
2814                         engine->semaphore.mbox.signal[VCS2] = GEN6_NOSYNC;
2815                 }
2816         } else if (IS_GEN5(dev_priv)) {
2817                 engine->add_request = pc_render_add_request;
2818                 engine->flush = gen4_render_ring_flush;
2819                 engine->get_seqno = pc_render_get_seqno;
2820                 engine->set_seqno = pc_render_set_seqno;
2821                 engine->irq_get = gen5_ring_get_irq;
2822                 engine->irq_put = gen5_ring_put_irq;
2823                 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT |
2824                                         GT_RENDER_PIPECTL_NOTIFY_INTERRUPT;
2825         } else {
2826                 engine->add_request = i9xx_add_request;
2827                 if (INTEL_GEN(dev_priv) < 4)
2828                         engine->flush = gen2_render_ring_flush;
2829                 else
2830                         engine->flush = gen4_render_ring_flush;
2831                 engine->get_seqno = ring_get_seqno;
2832                 engine->set_seqno = ring_set_seqno;
2833                 if (IS_GEN2(dev_priv)) {
2834                         engine->irq_get = i8xx_ring_get_irq;
2835                         engine->irq_put = i8xx_ring_put_irq;
2836                 } else {
2837                         engine->irq_get = i9xx_ring_get_irq;
2838                         engine->irq_put = i9xx_ring_put_irq;
2839                 }
2840                 engine->irq_enable_mask = I915_USER_INTERRUPT;
2841         }
2842         engine->write_tail = ring_write_tail;
2843
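             /*
              * Select the execbuffer dispatch vfunc for this platform:
              * Haswell and gen8 have dedicated variants, older parts use
              * the generation-appropriate legacy path.
              */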
2844         if (IS_HASWELL(dev_priv))
2845                 engine->dispatch_execbuffer = hsw_ring_dispatch_execbuffer;
2846         else if (IS_GEN8(dev_priv))
2847                 engine->dispatch_execbuffer = gen8_ring_dispatch_execbuffer;
2848         else if (INTEL_GEN(dev_priv) >= 6)
2849                 engine->dispatch_execbuffer = gen6_ring_dispatch_execbuffer;
2850         else if (INTEL_GEN(dev_priv) >= 4)
2851                 engine->dispatch_execbuffer = i965_dispatch_execbuffer;
2852         else if (IS_I830(dev_priv) || IS_845G(dev_priv))
2853                 engine->dispatch_execbuffer = i830_dispatch_execbuffer;
2854         else
2855                 engine->dispatch_execbuffer = i915_dispatch_execbuffer;
2856         engine->init_hw = init_render_ring;
2857         engine->cleanup = render_ring_cleanup;
2858
2859         /* Allocate a workaround batch buffer to combat the CS TLB bug. */
2860         if (HAS_BROKEN_CS_TLB(dev_priv)) {
2861                 obj = i915_gem_object_create(dev, I830_WA_SIZE);
2862                 if (IS_ERR(obj)) {
2863                         DRM_ERROR("Failed to allocate batch bo\n");
2864                         return PTR_ERR(obj);
2865                 }
2866
2867                 ret = i915_gem_obj_ggtt_pin(obj, 0, 0);
2868                 if (ret != 0) {
2869                         drm_gem_object_unreference(&obj->base);
2870                         DRM_ERROR("Failed to pin batch bo\n");
2871                         return ret;
2872                 }
2873
2874                 engine->scratch.obj = obj;
2875                 engine->scratch.gtt_offset = i915_gem_obj_ggtt_offset(obj);
2876         }
2877
2878         ret = intel_init_ring_buffer(dev, engine);
2879         if (ret)
2880                 return ret;
2881
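             /* Gen5+ render also needs the PIPE_CONTROL scratch page. */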
2882         if (INTEL_GEN(dev_priv) >= 5) {
2883                 ret = intel_init_pipe_control(engine);
2884                 if (ret)
2885                         return ret;
2886         }
2887
2888         return 0;
2889 }
2890
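     /*
      * Initialize the BSD (video) ring.  Gen6+ parts live at
      * GEN6_BSD_RING_BASE (with a gen6-only tail-write workaround), while
      * older parts use the legacy BSD_RING_BASE setup below.
      */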
2891 int intel_init_bsd_ring_buffer(struct drm_device *dev)
2892 {
2893         struct drm_i915_private *dev_priv = dev->dev_private;
2894         struct intel_engine_cs *engine = &dev_priv->engine[VCS];
2895
2896         engine->name = "bsd ring";
2897         engine->id = VCS;
2898         engine->exec_id = I915_EXEC_BSD;
2899         engine->hw_id = 1;
2900
2901         engine->write_tail = ring_write_tail;
2902         if (INTEL_GEN(dev_priv) >= 6) {
2903                 engine->mmio_base = GEN6_BSD_RING_BASE;
2904                 /* gen6 bsd needs a special workaround for tail updates */
2905                 if (IS_GEN6(dev_priv))
2906                         engine->write_tail = gen6_bsd_ring_write_tail;
2907                 engine->flush = gen6_bsd_ring_flush;
2908                 engine->add_request = gen6_add_request;
2909                 engine->irq_seqno_barrier = gen6_seqno_barrier;
2910                 engine->get_seqno = ring_get_seqno;
2911                 engine->set_seqno = ring_set_seqno;
2912                 if (INTEL_GEN(dev_priv) >= 8) {
2913                         engine->irq_enable_mask =
2914                                 GT_RENDER_USER_INTERRUPT << GEN8_VCS1_IRQ_SHIFT;
2915                         engine->irq_get = gen8_ring_get_irq;
2916                         engine->irq_put = gen8_ring_put_irq;
2917                         engine->dispatch_execbuffer =
2918                                 gen8_ring_dispatch_execbuffer;
2919                         if (i915_semaphore_is_enabled(dev_priv)) {
2920                                 engine->semaphore.sync_to = gen8_ring_sync;
2921                                 engine->semaphore.signal = gen8_xcs_signal;
2922                                 GEN8_RING_SEMAPHORE_INIT(engine);
2923                         }
2924                 } else {
2925                         engine->irq_enable_mask = GT_BSD_USER_INTERRUPT;
2926                         engine->irq_get = gen6_ring_get_irq;
2927                         engine->irq_put = gen6_ring_put_irq;
2928                         engine->dispatch_execbuffer =
2929                                 gen6_ring_dispatch_execbuffer;
2930                         if (i915_semaphore_is_enabled(dev_priv)) {
2931                                 engine->semaphore.sync_to = gen6_ring_sync;
2932                                 engine->semaphore.signal = gen6_signal;
2933                                 engine->semaphore.mbox.wait[RCS] = MI_SEMAPHORE_SYNC_VR;
2934                                 engine->semaphore.mbox.wait[VCS] = MI_SEMAPHORE_SYNC_INVALID;
2935                                 engine->semaphore.mbox.wait[BCS] = MI_SEMAPHORE_SYNC_VB;
2936                                 engine->semaphore.mbox.wait[VECS] = MI_SEMAPHORE_SYNC_VVE;
2937                                 engine->semaphore.mbox.wait[VCS2] = MI_SEMAPHORE_SYNC_INVALID;
2938                                 engine->semaphore.mbox.signal[RCS] = GEN6_RVSYNC;
2939                                 engine->semaphore.mbox.signal[VCS] = GEN6_NOSYNC;
2940                                 engine->semaphore.mbox.signal[BCS] = GEN6_BVSYNC;
2941                                 engine->semaphore.mbox.signal[VECS] = GEN6_VEVSYNC;
2942                                 engine->semaphore.mbox.signal[VCS2] = GEN6_NOSYNC;
2943                         }
2944                 }
2945         } else {
2946                 engine->mmio_base = BSD_RING_BASE;
2947                 engine->flush = bsd_ring_flush;
2948                 engine->add_request = i9xx_add_request;
2949                 engine->get_seqno = ring_get_seqno;
2950                 engine->set_seqno = ring_set_seqno;
2951                 if (IS_GEN5(dev_priv)) {
2952                         engine->irq_enable_mask = ILK_BSD_USER_INTERRUPT;
2953                         engine->irq_get = gen5_ring_get_irq;
2954                         engine->irq_put = gen5_ring_put_irq;
2955                 } else {
2956                         engine->irq_enable_mask = I915_BSD_USER_INTERRUPT;
2957                         engine->irq_get = i9xx_ring_get_irq;
2958                         engine->irq_put = i9xx_ring_put_irq;
2959                 }
2960                 engine->dispatch_execbuffer = i965_dispatch_execbuffer;
2961         }
2962         engine->init_hw = init_ring_common;
2963
2964         return intel_init_ring_buffer(dev, engine);
2965 }
2966
2967 /**
2968  * Initialize the second BSD ring (e.g. Broadwell GT3, Skylake GT3)
2969  */
2970 int intel_init_bsd2_ring_buffer(struct drm_device *dev)
2971 {
2972         struct drm_i915_private *dev_priv = dev->dev_private;
2973         struct intel_engine_cs *engine = &dev_priv->engine[VCS2];
2974
2975         engine->name = "bsd2 ring";
2976         engine->id = VCS2;
2977         engine->exec_id = I915_EXEC_BSD;
2978         engine->hw_id = 4;
2979
2980         engine->write_tail = ring_write_tail;
2981         engine->mmio_base = GEN8_BSD2_RING_BASE;
2982         engine->flush = gen6_bsd_ring_flush;
2983         engine->add_request = gen6_add_request;
2984         engine->irq_seqno_barrier = gen6_seqno_barrier;
2985         engine->get_seqno = ring_get_seqno;
2986         engine->set_seqno = ring_set_seqno;
2987         engine->irq_enable_mask =
2988                         GT_RENDER_USER_INTERRUPT << GEN8_VCS2_IRQ_SHIFT;
2989         engine->irq_get = gen8_ring_get_irq;
2990         engine->irq_put = gen8_ring_put_irq;
2991         engine->dispatch_execbuffer =
2992                         gen8_ring_dispatch_execbuffer;
2993         if (i915_semaphore_is_enabled(dev_priv)) {
2994                 engine->semaphore.sync_to = gen8_ring_sync;
2995                 engine->semaphore.signal = gen8_xcs_signal;
2996                 GEN8_RING_SEMAPHORE_INIT(engine);
2997         }
2998         engine->init_hw = init_ring_common;
2999
3000         return intel_init_ring_buffer(dev, engine);
3001 }
3002
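     /*
      * Initialize the blitter (BLT) ring.  Gen8+ uses the shifted
      * interrupt mask and gen8 semaphore/dispatch helpers; gen6/7 use the
      * mailbox-register semaphores programmed below.
      */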
3003 int intel_init_blt_ring_buffer(struct drm_device *dev)
3004 {
3005         struct drm_i915_private *dev_priv = dev->dev_private;
3006         struct intel_engine_cs *engine = &dev_priv->engine[BCS];
3007
3008         engine->name = "blitter ring";
3009         engine->id = BCS;
3010         engine->exec_id = I915_EXEC_BLT;
3011         engine->hw_id = 2;
3012
3013         engine->mmio_base = BLT_RING_BASE;
3014         engine->write_tail = ring_write_tail;
3015         engine->flush = gen6_ring_flush;
3016         engine->add_request = gen6_add_request;
3017         engine->irq_seqno_barrier = gen6_seqno_barrier;
3018         engine->get_seqno = ring_get_seqno;
3019         engine->set_seqno = ring_set_seqno;
3020         if (INTEL_GEN(dev_priv) >= 8) {
3021                 engine->irq_enable_mask =
3022                         GT_RENDER_USER_INTERRUPT << GEN8_BCS_IRQ_SHIFT;
3023                 engine->irq_get = gen8_ring_get_irq;
3024                 engine->irq_put = gen8_ring_put_irq;
3025                 engine->dispatch_execbuffer = gen8_ring_dispatch_execbuffer;
3026                 if (i915_semaphore_is_enabled(dev_priv)) {
3027                         engine->semaphore.sync_to = gen8_ring_sync;
3028                         engine->semaphore.signal = gen8_xcs_signal;
3029                         GEN8_RING_SEMAPHORE_INIT(engine);
3030                 }
3031         } else {
3032                 engine->irq_enable_mask = GT_BLT_USER_INTERRUPT;
3033                 engine->irq_get = gen6_ring_get_irq;
3034                 engine->irq_put = gen6_ring_put_irq;
3035                 engine->dispatch_execbuffer = gen6_ring_dispatch_execbuffer;
3036                 if (i915_semaphore_is_enabled(dev_priv)) {
3037                         engine->semaphore.signal = gen6_signal;
3038                         engine->semaphore.sync_to = gen6_ring_sync;
3039                         /*
3040                          * These semaphores are only used on pre-gen8
3041                          * platforms, and there is no VCS2 ring before
3042                          * gen8, so the semaphore between BCS and VCS2
3043                          * is initialized as INVALID. Gen8 sets up the
3044                          * semaphore between BCS and VCS2 separately.
3045                          */
3046                         engine->semaphore.mbox.wait[RCS] = MI_SEMAPHORE_SYNC_BR;
3047                         engine->semaphore.mbox.wait[VCS] = MI_SEMAPHORE_SYNC_BV;
3048                         engine->semaphore.mbox.wait[BCS] = MI_SEMAPHORE_SYNC_INVALID;
3049                         engine->semaphore.mbox.wait[VECS] = MI_SEMAPHORE_SYNC_BVE;
3050                         engine->semaphore.mbox.wait[VCS2] = MI_SEMAPHORE_SYNC_INVALID;
3051                         engine->semaphore.mbox.signal[RCS] = GEN6_RBSYNC;
3052                         engine->semaphore.mbox.signal[VCS] = GEN6_VBSYNC;
3053                         engine->semaphore.mbox.signal[BCS] = GEN6_NOSYNC;
3054                         engine->semaphore.mbox.signal[VECS] = GEN6_VEBSYNC;
3055                         engine->semaphore.mbox.signal[VCS2] = GEN6_NOSYNC;
3056                 }
3057         }
3058         engine->init_hw = init_ring_common;
3059
3060         return intel_init_ring_buffer(dev, engine);
3061 }
3062
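     /*
      * Initialize the video enhancement (VEBOX) ring.  Gen8+ uses the
      * standard gen8 irq and semaphore helpers; earlier parts (Haswell)
      * use the PM-interrupt based vebox irq handlers.
      */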
3063 int intel_init_vebox_ring_buffer(struct drm_device *dev)
3064 {
3065         struct drm_i915_private *dev_priv = dev->dev_private;
3066         struct intel_engine_cs *engine = &dev_priv->engine[VECS];
3067
3068         engine->name = "video enhancement ring";
3069         engine->id = VECS;
3070         engine->exec_id = I915_EXEC_VEBOX;
3071         engine->hw_id = 3;
3072
3073         engine->mmio_base = VEBOX_RING_BASE;
3074         engine->write_tail = ring_write_tail;
3075         engine->flush = gen6_ring_flush;
3076         engine->add_request = gen6_add_request;
3077         engine->irq_seqno_barrier = gen6_seqno_barrier;
3078         engine->get_seqno = ring_get_seqno;
3079         engine->set_seqno = ring_set_seqno;
3080
3081         if (INTEL_GEN(dev_priv) >= 8) {
3082                 engine->irq_enable_mask =
3083                         GT_RENDER_USER_INTERRUPT << GEN8_VECS_IRQ_SHIFT;
3084                 engine->irq_get = gen8_ring_get_irq;
3085                 engine->irq_put = gen8_ring_put_irq;
3086                 engine->dispatch_execbuffer = gen8_ring_dispatch_execbuffer;
3087                 if (i915_semaphore_is_enabled(dev_priv)) {
3088                         engine->semaphore.sync_to = gen8_ring_sync;
3089                         engine->semaphore.signal = gen8_xcs_signal;
3090                         GEN8_RING_SEMAPHORE_INIT(engine);
3091                 }
3092         } else {
3093                 engine->irq_enable_mask = PM_VEBOX_USER_INTERRUPT;
3094                 engine->irq_get = hsw_vebox_get_irq;
3095                 engine->irq_put = hsw_vebox_put_irq;
3096                 engine->dispatch_execbuffer = gen6_ring_dispatch_execbuffer;
3097                 if (i915_semaphore_is_enabled(dev_priv)) {
3098                         engine->semaphore.sync_to = gen6_ring_sync;
3099                         engine->semaphore.signal = gen6_signal;
3100                         engine->semaphore.mbox.wait[RCS] = MI_SEMAPHORE_SYNC_VER;
3101                         engine->semaphore.mbox.wait[VCS] = MI_SEMAPHORE_SYNC_VEV;
3102                         engine->semaphore.mbox.wait[BCS] = MI_SEMAPHORE_SYNC_VEB;
3103                         engine->semaphore.mbox.wait[VECS] = MI_SEMAPHORE_SYNC_INVALID;
3104                         engine->semaphore.mbox.wait[VCS2] = MI_SEMAPHORE_SYNC_INVALID;
3105                         engine->semaphore.mbox.signal[RCS] = GEN6_RVESYNC;
3106                         engine->semaphore.mbox.signal[VCS] = GEN6_VVESYNC;
3107                         engine->semaphore.mbox.signal[BCS] = GEN6_BVESYNC;
3108                         engine->semaphore.mbox.signal[VECS] = GEN6_NOSYNC;
3109                         engine->semaphore.mbox.signal[VCS2] = GEN6_NOSYNC;
3110                 }
3111         }
3112         engine->init_hw = init_ring_common;
3113
3114         return intel_init_ring_buffer(dev, engine);
3115 }
3116
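     /*
      * Emit a flush of all GPU write domains for the request if the
      * engine has dirty caches, then clear engine->gpu_caches_dirty.
      */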
3117 int
3118 intel_ring_flush_all_caches(struct drm_i915_gem_request *req)
3119 {
3120         struct intel_engine_cs *engine = req->engine;
3121         int ret;
3122
3123         if (!engine->gpu_caches_dirty)
3124                 return 0;
3125
3126         ret = engine->flush(req, 0, I915_GEM_GPU_DOMAINS);
3127         if (ret)
3128                 return ret;
3129
3130         trace_i915_gem_ring_flush(req, 0, I915_GEM_GPU_DOMAINS);
3131
3132         engine->gpu_caches_dirty = false;
3133         return 0;
3134 }
3135
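     /*
      * Emit an invalidation of all GPU read domains for the request,
      * flushing any dirty GPU write domains in the same command, and
      * clear engine->gpu_caches_dirty.
      */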
3136 int
3137 intel_ring_invalidate_all_caches(struct drm_i915_gem_request *req)
3138 {
3139         struct intel_engine_cs *engine = req->engine;
3140         uint32_t flush_domains;
3141         int ret;
3142
3143         flush_domains = 0;
3144         if (engine->gpu_caches_dirty)
3145                 flush_domains = I915_GEM_GPU_DOMAINS;
3146
3147         ret = engine->flush(req, I915_GEM_GPU_DOMAINS, flush_domains);
3148         if (ret)
3149                 return ret;
3150
3151         trace_i915_gem_ring_flush(req, I915_GEM_GPU_DOMAINS, flush_domains);
3152
3153         engine->gpu_caches_dirty = false;
3154         return 0;
3155 }
3156
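     /*
      * Quiesce an engine prior to teardown: wait for it to become idle
      * (reporting an error if that fails) and then stop the ring itself.
      */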
3157 void
3158 intel_stop_engine(struct intel_engine_cs *engine)
3159 {
3160         int ret;
3161
3162         if (!intel_engine_initialized(engine))
3163                 return;
3164
3165         ret = intel_engine_idle(engine);
3166         if (ret)
3167                 DRM_ERROR("failed to quiesce %s whilst cleaning up: %d\n",
3168                           engine->name, ret);
3169
3170         stop_ring(engine);
3171 }