<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
    <channel>
        <title>GPU - Tag - Simi Studio</title>
        <link>/en/tags/gpu/</link>
        <description>GPU - Tag - Simi Studio</description>
        <generator>Hugo -- gohugo.io</generator>
        <language>en</language>
        <managingEditor>simi@simi.studio (Simi)</managingEditor>
        <webMaster>simi@simi.studio (Simi)</webMaster>
        <lastBuildDate>Sat, 15 Jun 2024 10:00:00 &#43;0800</lastBuildDate>
        <atom:link href="/en/tags/gpu/" rel="self" type="application/rss+xml" />
<item>
    <title>vLLM in Action: How to Run Open-Source LLMs Efficiently on GPU Servers</title>
    <link>/en/posts/vllm-local-llm-serving/</link>
    <pubDate>Sat, 15 Jun 2024 10:00:00 &#43;0800</pubDate>
    <author>simi@simi.studio (Simi)</author>
    <guid>/en/posts/vllm-local-llm-serving/</guid>
    <description><![CDATA[vLLM is one of the most popular open-source LLM inference engines today. Its PagedAttention technique delivers up to 24x higher throughput than Hugging Face Transformers on the same hardware. This article explains what vLLM is, how to deploy it, and practical considerations for running it in production.]]></description>
</item>
</channel>
</rss>
