[转]php sphinx 高效率搜索引擎搭建

原文地址：http://blog.csdn.net/adparking/article/details/7207119

1.下载PHP客户端安装:

http://pecl.php.net/package/sphinx

vim sphinxclient.c
找到
void sock_close ( int sock );
改为
static void sock_close ( int sock );

在php.ini加扩展,客户端安装完毕 2.安装sphinx,前提你已经安装mysql且安装了mysql-devel重源码安装mysql的

mysql-devel都已经安装,yum安装的运行

yum -y install mysql-devel

下载sphinx2.0.1地址:

http://sphinxsearch.com/downloads/

tar -xvzf sphinx-2.0.1-beta.tar.gz
cd sphinx-2.0.1-beta
./configure –prefix=/usr/local/sphinx –with-mysql –with-iconv

备注:64位增加参数 –enable-id64

make && make install cd /usr/local/sphinx/etc/ cp sphinx.conf.dist sphinx.conf

[plain]view plaincopy
 
#  

# Sphinx configuration file sample  
 
#  
 
# WARNING! While this sample file mentions all available options,  
 
# it contains (very) short helper descriptions only. Please refer to  
 
# doc/sphinx.html for details.  
 
#  
 
#############################################################################  
 
## data source definition  
 
#############################################################################  
 
source src1  
 
{  
 
    # data source type. mandatory, no default value  
 
    # known types are mysql, pgsql, mssql, xmlpipe, xmlpipe2, odbc  
 
    type            = mysql  
 
    #####################################################################  
 
    ## SQL settings (for ‘mysql’ and ‘pgsql’ types)  
 
    #####################################################################  
 
    # some straightforward parameters for SQL source types  
 
    sql_host        = localhost  
 
    sql_user        = root  
 
    sql_pass        = ******   
 
    sql_db            = ******  
 
    sql_port        = 3306    # optional, default is 3306  
 
    # UNIX socket name  
 
    # optional, default is empty (reuse client library defaults)  
 
    # usually ‘/var/lib/mysql/mysql.sock’ on Linux  
 
    # usually ‘/tmp/mysql.sock’ on FreeBSD  
 
    #  
 
    sql_sock        = /tmp/mysql.sock  
 
    # MySQL specific client connection flags  
 
    # optional, default is 0  
 
    # 数据传输方式  
 
    # mysql_connect_flags    = 32 # enable compression  
 
    # MySQL specific SSL certificate settings  
 
    # optional, defaults are empty  
 
    # SLL链接  
 
    # mysql_ssl_cert        = /etc/ssl/client-cert.pem  
 
    # mysql_ssl_key        = /etc/ssl/client-key.pem  
 
    # mysql_ssl_ca        = /etc/ssl/cacert.pem  
 
    # MS SQL specific Windows authentication mode flag  
 
    # MUST be in sync with charset_type index-level setting  
 
    # optional, default is 0  
 
    #  
 
    # mssql_winauth        = 1 # use currently logged on user credentials  
 
    # MS SQL specific Unicode indexing flag  
 
    # optional, default is 0 (request SBCS data)  
 
    #  
 
    # mssql_unicode        = 1 # request Unicode data from server  
 
    # ODBC specific DSN (data source name)  
 
    # mandatory for odbc source type, no default value  
 
    #  
 
    # odbc_dsn        = DBQ=C:\data;DefaultDir=C:\data;Driver={Microsoft Text Driver (*.txt; *.csv)};  
 
    # sql_query        = SELECT id, data FROM documents.csv  
 
    # ODBC and MS SQL specific, per-column buffer sizes  
 
    # optional, default is auto-detect  
 
    #  
 
    # sql_column_buffers    = content=12M, comments=1M  
 
    # pre-query, executed before the main fetch query  
 
    # multi-value, optional, default is empty list of queries  
 
    # 发送SQL语句前发送  
 
    sql_query_pre        = SET NAMES utf8  
 
    sql_query_pre        = SET SESSION query_cache_type=OFF  
 
    # main document fetch query  
 
    # mandatory, integer document ID field MUST be the first selected column  
 
    # 需要查询的表 构建查询  
 
    sql_query        = \  
 
        SELECT id,target_type,genre,stars,sub_title,sports_team,music_band,music_album \  
 
        FROM ko_link  
 
#如果多个数据源并要在一个索引,必须要保持字段的顺序数量跟数据都要一致,否则将出错  
 
    # joined/payload field fetch query  
 
    # joined fields let you avoid (slow) JOIN and GROUP_CONCAT  
 
    # payload fields let you attach custom per-keyword values (eg. for ranking)  
 
    #  
 
    # syntax is FIELD-NAME ‘from’  ( ‘query’ | ‘payload-query’ ); QUERY  
 
    # joined field QUERY should return 2 columns (docid, text)  
 
    # payload field QUERY should return 3 columns (docid, keyword, weight)  
 
    #  
 
    # REQUIRES that query results are in ascending docuent ID order!  
 
    # multi-value, optional, default is empty list of queries  
 
    #  添加字段,来源与表 自动连接  
 
# 字段结果集保持为  
 
# (1,tags1)  
 
# (1,tags2)  
 
# (2,tags3)  
 
# (2,tags4)  
 
# 添加字段将用于搜索,结果如有第3个字段,第3个字段表示该记录的权重,权重为大于1的值  
 
    # sql_joined_field    = tags from query; SELECT docid, CONCAT(‘tag’,tagid) FROM tags ORDER BY docid ASC  
 
    # sql_joined_field    = wtags from payload-query; SELECT docid, tag, tagweight FROM tags ORDER BY docid ASC  
 
    # file based field declaration  
 
    #  
 
    # content of this field is treated as a file name  
 
    # and the file gets loaded and indexed in place of a field  
 
    #  
 
    # max file size is limited by max_file_field_buffer indexer setting  
 
    # file IO errors are non-fatal and get reported as warnings  
 
    # 把字段声明放入文件  
 
    # sql_file_field        = content_file_path  
 
    # range query setup, query that must return min and max ID values  
 
    # optional, default is empty  
 
    #   
 
    # sql_query will need to reference $start and $end boundaries  
 
    # if using ranged query:  
 
    # 分区查询,防止MYSQL死锁  
 
    # sql_query        = \  
 
    #    SELECT doc.id, doc.id AS group, doc.title, doc.data \  
 
    #    FROM documents doc \  
 
    #    WHERE id>=$start AND id<=$end  
 
    #  
 
    # sql_query_range        = SELECT MIN(id),MAX(id) FROM documents  
 
    # range query step  
 
    # optional, default is 1024  
 
    # 分区查询跳步  
 
    # sql_range_step        = 1000  
 
    # unsigned integer attribute declaration  
 
    # multi-value (an arbitrary number of attributes is allowed), optional  
 
    # optional bit size can be specified, default is 32  
 
    # 声明无符号数字段  
 
    #sql_attr_uint        = target_type  
 
    # sql_attr_uint        = forum_id:9 # 9 bits for forum_id  
 
    #sql_attr_uint        = group_id  
 
    #声明BOOL字段  
 
    # boolean attribute declaration  
 
    # multi-value (an arbitrary number of attributes is allowed), optional  
 
    # equivalent to sql_attr_uint with 1-bit size  
 
    #  
 
    # sql_attr_bool        = is_deleted  
 
    # bigint attribute declaration  
 
    # multi-value (an arbitrary number of attributes is allowed), optional  
 
    # declares a signed (unlike uint!) 64-bit attribute  
 
    # 声明长整字段  
 
    # sql_attr_bigint        = my_bigint_id  
 
    # UNIX timestamp attribute declaration  
 
    # multi-value (an arbitrary number of attributes is allowed), optional  
 
    # similar to integer, but can also be used in date functions  
 
    # 声明时间字段  
 
    # sql_attr_timestamp    = posted_ts  
 
    # sql_attr_timestamp    = last_edited_ts  
 
    #sql_attr_timestamp    = date_added  
 
    # string ordinal attribute declaration  
 
    # multi-value (an arbitrary number of attributes is allowed), optional  
 
    # sorts strings (bytewise), and stores their indexes in the sorted list  
 
    # sorting by this attr is equivalent to sorting by the original strings  
 
    # 声明字符串字段 用于排序等,但此字段不会被存储  
 
    # sql_attr_str2ordinal    = author_name  
 
    # floating point attribute declaration  
 
    # multi-value (an arbitrary number of attributes is allowed), optional  
 
    # values are stored in single precision, 32-bit IEEE 754 format  
 
    # 声明浮点字段  
 
    # sql_attr_float        = lat_radians  
 
    # sql_attr_float        = long_radians  
 
    # multi-valued attribute (MVA) attribute declaration  
 
    # multi-value (an arbitrary number of attributes is allowed), optional  
 
    # MVA values are variable length lists of unsigned 32-bit integers  
 
    #  
 
    # syntax is ATTR-TYPE ATTR-NAME ‘from’ SOURCE-TYPE [;QUERY] [;RANGE-QUERY]  
 
    # ATTR-TYPE is ‘uint’ or ‘timestamp’  
 
    # SOURCE-TYPE is ‘field’, ‘query’, or ‘ranged-query’  
 
    # QUERY is SQL query used to fetch all ( docid, attrvalue ) pairs  
 
    # RANGE-QUERY is SQL query used to fetch min and max ID values, similar to ‘sql_query_range’  
 
    # 声明复合字段  
 
    # sql_attr_multi        = uint tag from query; SELECT docid, tagid FROM tags  
 
    # sql_attr_multi        = uint tag from ranged-query; \  
 
    #    SELECT docid, tagid FROM tags WHERE id>=$start AND id<=$end; \  
 
    #    SELECT MIN(docid), MAX(docid) FROM tags  
 
    # string attribute declaration  
 
    # multi-value (an arbitrary number of these is allowed), optional  
 
    # lets you store and retrieve strings  
 
    # 只是把数据存储,但不会索引改字段  
 
    # sql_attr_string        = stitle  
 
    # wordcount attribute declaration  
 
    # multi-value (an arbitrary number of these is allowed), optional  
 
    # lets you count the words at indexing time  
 
    # 将转化成关键字的字段,用于提高匹配率  
 
    # sql_attr_str2wordcount    = stitle  
 
    # combined field plus attribute declaration (from a single column)  
 
    # stores column as an attribute, but also indexes it as a full-text field  
 
    # 跟sql_attr_string不同是该属性加入索引  
 
    # sql_field_string    = author  
 
    # sql_field_str2wordcount    = title  
 
    # post-query, executed on sql_query completion  
 
    # optional, default is empty  
 
    # 取后查询  
 
    # sql_query_post        =  
 
    # post-index-query, executed on successful indexing completion  
 
    # optional, default is empty  
 
    # $maxid expands to max document ID actually fetched from DB  
 
    # 索引后查询  
 
    # sql_query_post_index    = REPLACE INTO counters ( id, val ) \  
 
    #    VALUES ( ‘max_indexed_id’, $maxid )  
 
    # ranged query throttling, in milliseconds  
 
    # optional, default is 0 which means no delay  
 
    # enforces given delay before each query step  
 
    #分区查询的时间间隔  
 
    sql_ranged_throttle    = 0  
 
    # document info query, ONLY for CLI search (ie. testing and debugging)  
 
    # optional, default is empty  
 
    # must contain $id macro and must fetch the document by that id  
 
    #命令行调试查询结果用  
 
    sql_query_info        = SELECT * FROM ko_link WHERE id=$id  
 
    # kill-list query, fetches the document IDs for kill-list  
 
    # k-list will suppress matches from preceding indexes in the same query  
 
    # optional, default is empty  
 
    ##清理指定查询ID列表,对于数据的更改  
 
    # sql_query_killlist    = SELECT id FROM documents WHERE edited>=@last_reindex  
 
    # columns to unpack on indexer side when indexing  
 
    # multi-value, optional, default is empty list  
 
    # 启用ZIP压缩 可以降低系统负载 但必须保证zlib库zlib-dev库可用  
 
    # unpack_zlib        = zlib_column  
 
    # unpack_mysqlcompress    = compressed_column  
 
    # unpack_mysqlcompress    = compressed_column_2  
 
    # maximum unpacked length allowed in MySQL COMPRESS() unpacker  
 
    # optional, default is 16M  
 
    # 压缩缓存区大小 不能小于字段存储值  
 
    # unpack_mysqlcompress_maxsize    = 16M  
 
    #####################################################################  
 
    ## xmlpipe2 配置  
 
    #####################################################################  
 
    # type            = xmlpipe  
 
    # shell command to invoke xmlpipe stream producer  
 
    # mandatory  
 
    #  
 
    # xmlpipe_command        = cat /usr/local/sphinx/var/test.xml  
 
    # xmlpipe2 field declaration  
 
    # multi-value, optional, default is empty  
 
    #  
 
    # xmlpipe_field        = subject  
 
    # xmlpipe_field        = content  
 
    # xmlpipe2 attribute declaration  
 
    # multi-value, optional, default is empty  
 
    # all xmlpipe_attr_XXX options are fully similar to sql_attr_XXX  
 
    #  
 
    # xmlpipe_attr_timestamp    = published  
 
    # xmlpipe_attr_uint    = author_id  
 
    # perform UTF-8 validation, and filter out incorrect codes  
 
    # avoids XML parser choking on non-UTF-8 documents  
 
    # optional, default is 0  
 
    #  
 
    # xmlpipe_fixup_utf8    = 1  
 
}  
 
# inherited source example  
 
# 继承数据源  
 
# all the parameters are copied from the parent source,  
 
# and may then be overridden in this source definition  
 
#source src1throttled : src1  
 
#{  
 
#    sql_ranged_throttle    = 100  
 
#}  
 
#############################################################################  
 
## index definition  
 
#############################################################################  
 
# local index example  
 
#  
 
# this is an index which is stored locally in the filesystem  
 
#  
 
# all indexing-time options (such as morphology and charsets)  
 
# are configured per local index  
 
index test1  
 
{  
 
    # index type  
 
    # optional, default is ‘plain’  
 
    # known values are ‘plain’, ‘distributed’, and ‘rt’ (see samples below)   
 
    #索引类型 本地 分布式   
 
    # type            = plain  
 
    # document source(s) to index  
 
    # multi-value, mandatory  
 
    # document IDs must be globally unique across all sources  
 
    #数据源,可以多个数据源  
 
    source            = src1  
 
    # index files path and file name, without extension  
 
    # mandatory, path must be writable, extensions will be auto-appended  
 
    # 索引保存路径  
 
    path            = /usr/local/sphinx/var/data/test1  
 
    # document attribute values (docinfo) storage mode  
 
    # optional, default is ‘extern’  
 
    # known values are ‘none’, ‘extern’ and ‘inline’  
 
    #索引存储方式  
 
    docinfo            = extern  
 
    # memory locking for cached data (.spa and .spi), to prevent swapping  
 
    # optional, default is 0 (do not mlock)  
 
    # requires searchd to be run from root  
 
    #内存锁定 需要保证足够权限  
 
    mlock            = 0  
 
    # a list of morphology preprocessors to apply  
 
    # optional, default is empty  
 
    #  
 
    # builtin preprocessors are ‘none’, ‘stem_en’, ‘stem_ru’, ‘stem_enru’,  
 
    # ‘soundex’, and ‘metaphone’; additional preprocessors available from  
 
    # libstemmer are ‘libstemmer_XXX’, where XXX is algorithm code  
 
    # (see libstemmer_c/libstemmer/modules.txt)  
 
    # 词语提取器  
 
    # morphology        = stem_en, stem_ru, soundex  
 
    # morphology        = libstemmer_german  
 
    # morphology        = libstemmer_sv  
 
    morphology        = stem_en  
 
    # minimum word length at which to enable stemming  
 
    # optional, default is 1 (stem everything)  
 
    # 词干化的最小词长  
 
    # min_stemming_len    = 1  
 
    # stopword files list (space separated)  
 
    # optional, default is empty  
 
    # contents are plain text, charset_table and stemming are both applied  
 
    # 停用搜索词  
 
    # stopwords        = /usr/local/sphinx/var/data/stopwords.txt  
 
    # wordforms file, in “mapfrom > mapto” plain text format  
 
    # optional, default is empty  
 
    # 词型字典 可用spelldump工具生成  
 
    # wordforms        = /usr/local/sphinx/var/data/wordforms.txt  
 
    # tokenizing exceptions file  
 
    # optional, default is empty  
 
    #Token特例文件,就是有些词是完整词意,不能拆分索引如a&t 跟a & t  
 
    # plain text, case sensitive, space insensitive in map-from part  
 
    # one “Map Several Words => ToASingleOne” entry per line  
 
    #  
 
    # exceptions        = /usr/local/sphinx/var/data/exceptions.txt  
 
    # minimum indexed word length  
 
    # default is 1 (index everything)  
 
    #  最小索引长度,就是小于指定长度的词不被索引  
 
    min_word_len        = 1  
 
    # charset encoding type  
 
    # optional, default is ‘sbcs’  
 
    # known types are ‘sbcs’ (Single Byte CharSet) and ‘utf-8’  
 
    # 字符编码  
 
    charset_type        = utf-8  
 
    # charset definition and case folding rules “table”  
 
    # optional, default value depends on charset_type  
 
    #  
 
    # defaults are configured to include English and Russian characters only  
 
    # you need to change the table to include additional ones  
 
    # this behavior MAY change in future versions  
 
    #  
 
    # ‘sbcs’ default value is  
 
    # charset_table        = 0..9, A..Z->a..z, _, a..z, U+A8->U+B8, U+B8, U+C0..U+DF->U+E0..U+FF, U+E0..U+FF  
 
    # 转换字符表  
 
    # ‘utf-8’ default value is  
 
    # charset_table        = 0..9, A..Z->a..z, _, a..z, U+410..U+42F->U+430..U+44F, U+430..U+44F  
 
    # ignored characters list  
 
    # optional, default value is empty  
 
    #  忽略字符表  
 
    # ignore_chars        = U+00AD  
 
    # minimum word prefix length to index  
 
    # optional, default is 0 (do not index prefixes)  
 
    #索引的最小前缀长度,小心使用,索引和搜索的时间皆会恶化  
 
    # min_prefix_len        = 0  
 
    # minimum word infix length to index  
 
    # optional, default is 0 (do not index infixes)  
 
    #索引的最小中缀长度 小心使用,索引和搜索的时间皆会恶化  
 
    # min_infix_len        = 0  
 
    # list of fields to limit prefix/infix indexing to  
 
    # optional, default value is empty (index all fields in prefix/infix mode)  
 
    # 未知  
 
    # prefix_fields        = filename  
 
    # infix_fields        = url, domain  
 
    # enable star-syntax (wildcards) when searching prefix/infix indexes  
 
    # search-time only, does not affect indexing, can be 0 or 1  
 
    # optional, default is 0 (do not use wildcard syntax)  
 
    # 启用星号语法  
 
    # enable_star        = 1  
 
    # expand keywords with exact forms and/or stars when searching fit indexes  
 
    # search-time only, does not affect indexing, can be 0 or 1  
 
    # optional, default is 0 (do not expand keywords)  
 
    # 扩大搜索关键字 形式如: running -> ( running | *running* | =running )  
 
    # expand_keywords        = 1  
 
    # n-gram length to index, for CJK indexing  
 
    # only supports 0 and 1 for now, other lengths to be implemented  
 
    # optional, default is 0 (disable n-grams)  
 
    # 中文等其他语言的基本支持  
 
    # ngram_len        = 1  
 
    # n-gram characters list, for CJK indexing  
 
    # optional, default is empty  
 
    #中文或其他语言的值范围  
 
    # ngram_chars        = U+3000..U+2FA1F  
 
    # phrase boundary characters list  
 
    # optional, default is empty  
 
    # 边界符  
 
    # phrase_boundary        = ., ?, !, U+2026 # horizontal ellipsis  
 
    # phrase boundary word position increment  
 
    # optional, default is 0  
 
    # 边界符增量  
 
    # phrase_boundary_step    = 100  
 
    # blended characters list  
 
    # blended chars are indexed both as separators and valid characters  
 
    # for instance, AT&T will results in 3 tokens (“at”, “t”, and “at&t”)  
 
    # optional, default is empty  
 
    # 混合字符列表  
 
    # blend_chars        = +, &, U+23  
 
    # blended token indexing mode  
 
    # a comma separated list of blended token indexing variants  
 
    # known variants are trim_none, trim_head, trim_tail, trim_both, skip_pure  
 
    # optional, default is trim_none  
 
    #未知  
 
    # blend_mode        = trim_tail, skip_pure  
 
    # whether to strip HTML tags from incoming documents  
 
    # known values are 0 (do not strip) and 1 (do strip)  
 
    # optional, default is 0  
 
    # 删除HTML标签 (小心文本被删除)  
 
    html_strip        = 0  
 
    # what HTML attributes to index if stripping HTML  
 
    # optional, default is empty (do not index anything)  
 
    # 保留的HTML标签  
 
    # html_index_attrs    = img=alt,title; a=title;  
 
    # what HTML elements contents to strip  
 
    # optional, default is empty (do not strip element contents)  
 
    # 不但删除标签,其包含的文本也将删除  
 
    # html_remove_elements    = style, script  
 
    # whether to preopen index data files on startup  
 
    # optional, default is 0 (do not preopen), searchd-only  
 
    # 预先打开索引还是每次查询的时候在打开索引  
 
    # preopen            = 1  
 
    # whether to keep dictionary (.spi) on disk, or cache it in RAM  
 
    # optional, default is 0 (cache in RAM), searchd-only  
 
    # 将字典文件是否保存在内存中  
 
    # ondisk_dict        = 1  
 
    # whether to enable in-place inversion (2x less disk, 90-95% speed)  
 
    # optional, default is 0 (use separate temporary files), indexer-only  
 
    # 是否启用原地索引倒转 将少磁盘使用 性能会有一点损失  
 
    # inplace_enable        = 1  
 
    # in-place fine-tuning options  
 
    # optional, defaults are listed below  
 
    #微调原地倒转  
 
    # inplace_hit_gap        = 0 # preallocated hitlist gap size   
 
    # inplace_docinfo_gap    = 0 # preallocated docinfo gap size  
 
    # inplace_reloc_factor    = 0.1 # relocation buffer size within arena  
 
    # inplace_write_factor    = 0.1 # write buffer size within arena  
 
    # whether to index original keywords along with stemmed versions  
 
    # enables “=exactform” operator to work  
 
    # optional, default is 0  
 
    # 是否在索引原关键词的词干化/重映射后的形式的同时也索引原词  
 
    # index_exact_words    = 1  
 
    # position increment on overshort (less that min_word_len) words  
 
    # optional, allowed values are 0 and 1, default is 1  
 
    #在经过过短的词（比 min_word_len短的词）处后增加位置值  
 
    # overshort_step        = 1  
 
    # position increment on stopword  
 
    # optional, allowed values are 0 and 1, default is 1  
 
    #在经过 停用词 处后增加位置值可选选项  
 
    # stopword_step        = 1  
 
    # hitless words list  
 
    # positions for these keywords will not be stored in the index  
 
    # optional, allowed values are ‘all’, or a list file name  
 
    # 不能中断的字符列表  
 
    # hitless_words        = all  
 
    # hitless_words        = hitless.txt #字符文件  
 
    # detect and index sentence and paragraph boundaries  
 
    # required for the SENTENCE and PARAGRAPH operators to work  
 
    # optional, allowed values are 0 and 1, default is 0  
 
    # 是否检查标签合并 针对HTML  
 
    # index_sp            = 1  
 
    # index zones, delimited by HTML/XML tags  
 
    # a comma separated list of tags and wildcards  
 
    # required for the ZONE operator to work  
 
    # optional, default is empty string (do not index zones)  
 
    # 对HTML标签的权重  
 
    # index_zones        = title, h*, th  
 
}  
 
# inherited index example  
 
# 索引继承  
 
# all the parameters are copied from the parent index,  
 
# and may then be overridden in this index definition  
 
#index test1stemmed : test1  
 
#{  
 
#    path            = /usr/local/sphinx/var/data/test1stemmed  
 
#    morphology        = stem_en  
 
#}  
 
# distributed index example  
 
#  
 
# this is a virtual index which can NOT be directly indexed,  
 
# and only contains references to other local and/or remote indexes  
 
#index dist1  
 
#{  
 
#分布式索引配置  
 
    # ‘distributed’ index type MUST be specified  
 
#    type            = distributed  
 
    # local index to be searched  
 
    # there can be many local indexes configured  
 
#    local            = test1  
 
#    local            = test1stemmed  
 
    # remote agent  
 
    # multiple remote agents may be specified  
 
    # syntax for TCP connections is ‘hostname:port:index1,[index2[,…]]’  
 
    # syntax for local UNIX connections is ‘/path/to/socket:index1,[index2[,…]]’  
 
#    agent            = localhost:9313:remote1  
 
#    agent            = localhost:9314:remote2,remote3  
 
    # agent            = /var/run/searchd.sock:remote4  
 
    # blackhole remote agent, for debugging/testing  
 
    # network errors and search results will be ignored  
 
    #  
 
    # agent_blackhole        = testbox:9312:testindex1,testindex2  
 
    # remote agent connection timeout, milliseconds  
 
    # optional, default is 1000 ms, ie. 1 sec  
 
#    agent_connect_timeout    = 1000  
 
    # remote agent query timeout, milliseconds  
 
    # optional, default is 3000 ms, ie. 3 sec  
 
#    agent_query_timeout    = 3000  
 
#}  
 
# realtime index example  
 
#  
 
# you can run INSERT, REPLACE, and DELETE on this index on the fly  
 
# using MySQL protocol (see ‘listen’ directive below)  
 
#index rt  
 
#{  
 
    # ‘rt’ index type must be specified to use RT index  
 
#    type            = rt  
 
    # index files path and file name, without extension  
 
    # mandatory, path must be writable, extensions will be auto-appended  
 
#    path            = /usr/local/sphinx/var/data/rt  
 
    # RAM chunk size limit  
 
    # RT index will keep at most this much data in RAM, then flush to disk  
 
    # optional, default is 32M  
 
    #  
 
    # rt_mem_limit        = 512M  
 
    # full-text field declaration  
 
    # multi-value, mandatory  
 
#    rt_field        = title  
 
#    rt_field        = content  
 
    # unsigned integer attribute declaration  
 
    # multi-value (an arbitrary number of attributes is allowed), optional  
 
    # declares an unsigned 32-bit attribute  
 
#    rt_attr_uint        = gid  
 
    # RT indexes currently support the following attribute types:  
 
    # uint, bigint, float, timestamp, string  
 
    #  
 
    # rt_attr_bigint        = guid  
 
    # rt_attr_float        = gpa  
 
    # rt_attr_timestamp    = ts_added  
 
    # rt_attr_string        = author  
 
#}  
 
#############################################################################  
 
## indexer settings  
 
#############################################################################  
 
indexer  
 
{  
 
    #索引过程内存使用限制。可选选项，默认32M。   
 
    # memory limit, in bytes, kiloytes (16384K) or megabytes (256M)  
 
    # optional, default is 32M, max is 2047M, recommended is 256M to 1024M  
 
    mem_limit        = 32M  
 
    # maximum IO calls per second (for I/O throttling)  
 
    # optional, default is 0 (unlimited)  
 
    # 每秒最大I/O操作次数，用于限制I/O操作。可选选项，默认为0（无限制）。   
 
    # max_iops        = 40  
 
    # maximum IO call size, bytes (for I/O throttling)  
 
    # optional, default is 0 (unlimited)  
 
    # 最大允许的I/O操作大小，以字节为单位，用于I/O节流。可选选项，默认为0（不限制）。   
 
    # max_iosize        = 1048576  
 
    # maximum xmlpipe2 field length, bytes  
 
    # optional, default is 2M  
 
    # 对于XMLLpipe2数据源允许的最大的域大小  
 
    # max_xmlpipe2_field    = 4M  
 
    # write buffer size, bytes  
 
    # several (currently up to 4) buffers will be allocated  
 
    # write buffers are allocated in addition to mem_limit  
 
    # optional, default is 1M  
 
    # 写缓冲区的大小，单位是字节。可选选项，默认值是1MB。   
 
    # write_buffer        = 1M  
 
    # maximum file field adaptive buffer size  
 
    # optional, default is 8M, minimum is 1M  
 
    #  
 
    # max_file_field_buffer    = 32M  
 
}  
 
#############################################################################  
 
## searchd settings  
 
#############################################################################  
 
searchd  
 
{  
 
    # [hostname:]port[:protocol], or /unix/socket/path to listen on  
 
    # known protocols are ‘sphinx’ (SphinxAPI) and ‘mysql41’ (SphinxQL)  
 
    #  
 
    # multi-value, multiple listen points are allowed  
 
    # optional, defaults are 9312:sphinx and 9306:mysql41, as below  
 
    #  
 
    # listen            = 127.0.0.1  
 
    # listen            = 192.168.0.1:9312  
 
    # listen            = 9312  
 
    # listen            = /var/run/searchd.sock  
 
    listen            = 9312  
 
    #listen            = 9306:mysql41  
 
    # log file, searchd run info is logged here  
 
    # optional, default is ‘searchd.log’  
 
    # 全部searchd运行时事件会被记录在这个日志文件中。   
 
    log            = /usr/local/sphinx/var/log/searchd.log  
 
    # query log file, all search queries are logged here  
 
    # optional, default is empty (do not log queries)  
 
    # 全部搜索查询会被记录在此文件中。  
 
    query_log        = /usr/local/sphinx/var/log/query.log  
 
    # client read timeout, seconds  
 
    # optional, default is 5  
 
    #网络客户端请求的读超时时间，单位是秒。  
 
    read_timeout        = 5  
 
    # request timeout, seconds  
 
    # optional, default is 5 minutes  
 
    #在使用持久连接时，两次查询之间等待的最长时间（单位是秒）。  
 
    client_timeout        = 300  
 
    # maximum amount of children to fork (concurrent searches to run)  
 
    # optional, default is 0 (unlimited)  
 
    #子进程的最大数量 ,用来控制服务器负载。任何时候不可能有比此设置值更多的搜索同时运行。当达到限制时，新的输入客户端会被用临时失败（SEARCH_RETRY）状态码驳回，同时给出一个声明服务器已到最大连接限制的消息。   
 
    max_children        = 30  
 
    # PID file, searchd process ID file name  
 
    # mandatory  
 
    #进程ID文件  
 
    pid_file        = /usr/local/sphinx/var/log/searchd.pid  
 
    # max amount of matches the daemon ever keeps in RAM, per-index  
 
    # WARNING, THERE’S ALSO PER-QUERY LIMIT, SEE SetLimits() API CALL  
 
    # default is 1000 (just like Google)  
 
    #守护进程在内存中为每个索引所保持并返回给客户端的匹配数目的最大值。  
 
    max_matches        = 1000  
 
    # seamless rotate, prevents rotate stalls if precaching huge datasets  
 
    # optional, default is 1  
 
    #防止 searchd 轮换在需要预取大量数据的索引时停止响应。可选选项，默认为1（启用无缝（seamless）轮换）。   
 
    seamless_rotate        = 1  
 
    # whether to forcibly preopen all indexes on startup  
 
    # optional, default is 1 (preopen everything)  
 
    #是否在启动是强制重新打开所有索引文件。可选选项，默认为0（不重新打开）。  
 
    preopen_indexes        = 1  
 
    # whether to unlink .old index copies on succesful rotation.  
 
    # optional, default is 1 (do unlink)  
 
    #索引轮换成功之后，是否删除以.old为扩展名的索引拷贝。可选选项，默认为1（删除这些索引拷贝）。   
 
    unlink_old        = 1  
 
    # attribute updates periodic flush timeout, seconds  
 
    # updates will be automatically dumped to disk this frequently  
 
    # optional, default is 0 (disable periodic flush)  
 
    # UpdateAttributes() 调用时候更新是否隔一段时间写入磁盘  
 
    # attr_flush_period    = 900  
 
    # instance-wide ondisk_dict defaults (per-index value take precedence)  
 
    # optional, default is 0 (precache all dictionaries in RAM)  
 
    #对 ondisk_dict 指令的全局的默认值。 可选选项，默认值是0（将字典预先缓冲到内存）。  
 
    # ondisk_dict_default    = 1  
 
    # MVA updates pool size  
 
    # shared between all instances of searchd, disables attr flushes!  
 
    # optional, default size is 1M  
 
    #网络通讯时允许的最大的包的大小。  
 
    mva_updates_pool    = 1M  
 
    # max allowed network packet size  
 
    # limits both query packets from clients, and responses from agents  
 
    # optional, default size is 8M  
 
    #用于多值属性MVA更新的存储空间的共享池大小。  
 
    max_packet_size        = 8M  
 
    # crash log path  
 
    # searchd will (try to) log crashed query to ‘crash_log_path.PID’ file  
 
    # optional, default is empty (do not create crash logs)  
 
    #崩溃日志文件的路径  
 
    # crash_log_path        = /usr/local/sphinx/var/log/crash  
 
    # max allowed per-query filter count  
 
    # optional, default is 256  
 
    #每次查询允许设置的过滤器的最大个数。只用于内部检查，不直接影响内存使用或性能。  
 
    max_filters        = 256  
 
    # max allowed per-filter values count  
 
    # optional, default is 4096  
 
    #单个过滤器允许的值的最大个数。只用于内部检查，不直接影响内存使用或性能。  
 
    max_filter_values    = 4096  
 
    # socket listen queue length  
 
    # optional, default is 5  
 
    #TCP监听积压列表长度。无法如对的请求立即失败并收到“连接被拒”错误信息  
 
    # listen_backlog        = 5  
 
    # per-keyword read buffer size  
 
    # optional, default is 256K  
 
    #每个关键字的读缓冲区的大小。可选选项，默认值是256K。   
 
    # read_buffer        = 256K  
 
    # unhinted read size (currently used when reading hits)  
 
    # optional, default is 32K  
 
    #无提示时读操作的大小。可选选项，默认值是32K。  
 
    # read_unhinted        = 32K  
 
    # max allowed per-batch query count (aka multi-query count)  
 
    # optional, default is 32  
 
    #限制每批次的查询量。一个OPEN之后的查询量  
 
    max_batch_queries    = 32  
 
    # max common subtree document cache size, per-query  
 
    # optional, default is 0 (disable subtree optimization)  
 
    #  
 
    # subtree_docs_cache    = 4M  
 
    # max common subtree hit cache size, per-query  
 
    # optional, default is 0 (disable subtree optimization)  
 
    # 限制RAM使用一个共同的子树优化 默认不优化  
 
    # subtree_hits_cache    = 8M  
 
    # multi-processing mode (MPM)  
 
    # known values are none, fork, prefork, and threads  
 
    # optional, default is fork  
 
    # 工作方式   
 
    workers            = threads # for RT to work  
 
    # max threads to create for searching local parts of a distributed index  
 
    # optional, default is 0, which means disable multi-threaded searching  
 
    # should work with all MPMs (ie. does NOT require workers=threads)  
 
    #    
 
    # dist_threads        = 4  
 
    # binlog files path; use empty string to disable binlog  
 
    # optional, default is build-time configured data directory  
 
    # 二进制日志路径  
 
    # binlog_path        = # disable logging  
 
    # binlog_path        = /usr/local/sphinx/var/data # binlog.001 etc will be created there  
 
    # binlog flush/sync mode  
 
    # 0 means flush and sync every second  
 
    # 1 means flush and sync every transaction  
 
    # 2 means flush every transaction, sync every second  
 
    # optional, default is 2  
 
    # 日志刷新模式  
 
    # binlog_flush        = 2  
 
    # binlog per-file size limit  
 
    # optional, default is 128M, 0 means no limit  
 
    #最大日志大小  
 
    # binlog_max_log_size    = 256M  
 
    # per-thread stack size, only affects workers=threads mode  
 
    # optional, default is 64K  
 
    #每个线程的堆栈大小。  
 
    # thread_stack            = 128K  
 
    # per-keyword expansion limit (for dict=keywords prefix searches)  
 
    # optional, default is 0 (no limit)  
 
    # 扩大为一个关键字的最大数目  
 
    # expansion_limit        = 1000  
 
    # RT RAM chunks flush period  
 
    # optional, default is 0 (no periodic flush)  
 
    #RT索引在内存中检查的时间  
 
    # rt_flush_period        = 900  
 
    # query log file format  
 
    # optional, known values are plain and sphinxql, default is plain  
 
    # 查询日志格式  
 
    # query_log_format        = sphinxql  
 
    # version string returned to MySQL network protocol clients  
 
    # optional, default is empty (use Sphinx version)  
 
    # MYSQL版本  
 
    # mysql_version_string    = 5.0.37  
 
    # trusted plugin directory  
 
    # optional, default is empty (disable UDFs)  
 
    # 插件目录  
 
    # plugin_dir            = /usr/local/sphinx/lib  
 
    # default server-wide collation  
 
    # optional, default is libc_ci  
 
    # 链接字符集  
 
    # collation_server        = utf8_general_ci  
 
    # server-wide locale for libc based collations  
 
    # optional, default is C  
 
    # collation 选项  
 
    # collation_libc_locale    = ru_RU.UTF-8  
 
    # threaded server watchdog (only used in workers=threads mode)  
 
    # optional, values are 0 and 1, default is 1 (watchdog on)  
 
    # 是否启用服务器监控进程  
 
    # watchdog                = 1  
 
    # SphinxQL compatibility mode (legacy columns and their names)  
 
    # optional, default is 0 (SQL compliant syntax and result sets)  
 
    #sphinxql 兼容模式  
 
    # compat_sphinxql_magics    = 1  
 
}  
 
# –eof–

建立索引:

/usr/local/sphinx/bin/indexer –config /usr/local/sphinx/etc/sphinx.conf index1

不停服务下索引：

/usr/local/sphinx/bin/indexer –config /usr/local/sphinx/etc/sphinx.conf –all –rotate

启动索引服务,使PHP的客户端可用

/usr/local/sphinx/bin/searchd –config /usr/local/sphinx/etc/sphinx.conf

建立PHP测试文件

<?php $s = new SphinxClient; setServer(“localhost”, 9312); $s->setMatchMode(SPH_MATCH_ANY); $s->setMaxQueryTime(3); $result = $s->query(“test”);#查询 print_r ($result); ?>

运行即可

searchd 命令:

强行停止:

searchd –config /home/myuser/sphinx.conf –stop

安静停止:

searchd –config /home/myuser/sphinx.conf –stopwait

状态:

searchd –config /home/myuser/sphinx.conf –status

指定PID文件

searchd –config /home/myuser/sphinx.conf –pidfile /home/myuser/sphinx.pid

启动的为控制台模式:

searchd –config /home/myuser/sphinx.conf –console

只启动指定索引

searchd –index myindex

生成一些字典的工具软件:spelldump

indextool 一些转移等工具的软件

搜索字符串规则:

* operator OR:
hello | world

* operator NOT:
hello -world
hello !world

* field search operator:
@title hello @body world

* field position limit modifier (introduced in version 0.9.9-rc1):
@body[50] hello

* multiple-field search operator:
@(title,body) hello world

* all-field search operator:
@* hello

* phrase search operator:
“hello world”

* proximity search operator:
“hello world”~10

* quorum matching operator:
“the world is a wonderful place”/3

* strict order operator (aka operator “before”):
aaa << bbb << ccc

* exact form modifier (introduced in version 0.9.9-rc1):
raining =cats and =dogs

* field-start and field-end modifier (introduced in version 0.9.9-rc2):
^hello world$

* NEAR, generalized proximity operator (introduced in version
2.0.1-beta):
hello NEAR/3 world NEAR/4 “my test”

* SENTENCE operator (introduced in version 2.0.1-beta):
all SENTENCE words SENTENCE “in one sentence”

* PARAGRAPH operator (introduced in version 2.0.1-beta):
“Bill Gates” PARAGRAPH “Steve Jobs”

* zone limit operator:
ZONE:(h3,h4) only in these titles

表达式,支持函数等日期用的是时间戳 (好像不是作为MYSQL存储引擎的时候用不到)

* Arithmetic operators: +, -, *, /, %, DIV, MOD
* Comparison operators: <, > <=, >=, =, <>
* Boolean operators: AND, OR, NOT
* Bitwise operators: &, |
* ABS()
* BIGINT()
* CEIL()
* COS()
* CRC32()
* DAY()
* EXP()
* FLOOR()
* GEODIST()
* IDIV()
* IF()
* IN()
* INTERVAL()
* LN()
* LOG10()
* LOG2()
* MAX()
* MIN()
* MONTH()
* NOW()
* POW()
* SIN()
* SINT()
* SQRT()
* YEAR()
* YEARMONTH()
* YEARMONTHDAY()

[php] view plaincopy
<?php   
  
include_once ‘sphinxapi.php’;  
$s = new SphinxClient();  
$s->setServer(“localhost”, 9312);  
$s->SetConnectTimeout ( 1 );//设置链接超时  
  
  
/* 
$s->AddQuery();//列表查询 
$s->RunQueries ();//执行列表查询 
$s->ResetFilters();//清除过滤条件 
$s->BuildExcerpts($docs, $index, $words);//生成简要 
$s->BuildKeywords($query, $index, $hits);//生成关键字 
$s->GetLastError();//错误 
$s->GetLastWarning();//警告 
$s->FlushAttributes();//索引刷入硬盘 
$s->IsConnectError();//链接错误 
$s->ResetGroupBy();//重设分组 
 
$s->SetFieldWeights(array(‘sub_title’=>1));//加权最小为1 
$s->SetIDRange($min, $max);//ID范围 
$s->SetIndexWeights(array(‘test1’=>1));//索引权重 
$s->Status();//服务是否可用 
$s->UpdateAttributes($index, $attrs, $values);//更新硬盘索引 
*/  
/* 
参考文档:http://www.coreseek.cn/docs/coreseek_4.1-sphinx_2.0.1-beta.html#matching-modes 
SPH_MATCH_ALL, matches all query words (default mode); 
SPH_MATCH_ANY, matches any of the query words; 
SPH_MATCH_PHRASE, matches query as a phrase, requiring perfect match; 
SPH_MATCH_BOOLEAN, matches query as a boolean expression (see Section 5.2, “Boolean query syntax”); 
SPH_MATCH_EXTENDED, matches query as an expression in Sphinx internal query language (see Section 5.3, “Extended query syntax”). As of 0.9.9, this has been superceded by SPH_MATCH_EXTENDED2, providing additional functionality and better performance. The ident is retained for legacy application code that will continue to be compatible once Sphinx and its components, including the API, are upgraded. 
SPH_MATCH_EXTENDED2, matches query using the second version of the Extended matching mode. 
SPH_MATCH_FULLSCAN, m 
*/  
$s->setMatchMode(SPH_MATCH_ANY);//匹配模式  
$s->setMaxQueryTime(3);//查询超时  
//$s->SetSelect ( $select );//设置返回的字段  
/* 
$cl->SetSelect ( “*, @weight+(user_karma+ln(pageviews))*0.1 AS myweight” ); 
$cl->SetSelect ( “exp_years, salary_gbp*{$gbp_usd_rate} AS salary_usd, 
   IF(age>40,1,0) AS over40″ ); 
$cl->SetSelect ( “*, AVG(price) AS avgprice” ); 
 */  
  
/* 
$cl->SetGroupBy ( “category”, SPH_GROUPBY_ATTR, “@count desc” ); 
$cl->SetGroupDistinct ( “vendor” ); 
== 
SELECT id, weight, all-attributes, 
    COUNT(DISTINCT vendor) AS @distinct, 
    COUNT(*) AS @count 
FROM products 
GROUP BY category 
ORDER BY @count DESC 
*/  
//$s->SetGroupBy ( $groupby, SPH_GROUPBY_ATTR, $groupsort );//汇总  
//$s->SetGroupDistinct ( $distinct );//设置不重复字段  
  
$s->SetArrayResult ( true );//结果是否有ID  
  
/* 
    SPH_SORT_RELEVANCE mode, that sorts by relevance in descending order (best matches first); 
    SPH_SORT_ATTR_DESC mode, that sorts by an attribute in descending order (bigger attribute values first); 
    SPH_SORT_ATTR_ASC mode, that sorts by an attribute in ascending order (smaller attribute values first); 
    SPH_SORT_TIME_SEGMENTS mode, that sorts by time segments (last hour/day/week/month) in descending order, and then by relevance in descending order; 
    SPH_SORT_EXTENDED mode, that sorts by SQL-like combination of columns in ASC/DESC order; 
    SPH_SORT_EXPR mode, that sorts by an arithmetic expression. 
 */  
//$s->SetSortMode ( SPH_SORT_EXTENDED, $sortby );//排序模式  
  
/* 
$s->SetOverride($attrname, $attrtype, $values); 
$s->ResetOverrides();*/  
/* 
$s->SetRetries($count);//设置失败重试 
$s->SetRankingMode($ranker);//设置排名模式 均适用于SPH_MATCH_EXTENDED2搜索 
 
//第3个参数当为true时，相当于$attribute!=$value，默认值是false 
$s->SetFilter ( ‘target_type’, $filtervals );//设置过滤,值列表 
$s->SetFilterFloatRange($attribute, $min, $max);//浮动范围 
$s->SetFilterRange($attribute, $min, $max);//指定范围 
$s->SetGeoAnchor($attrlat, $attrlong, $lat, $long); 
*/  
//link  
//$s->SetFilter ( ‘target_type’, array(1),true );  
  
  
$s->SetLimits ( 0, 10 );//显示数量:开始 量 最大量 右偏移量  
$result = $s->query(“good”,“team”);//查询  
  
  
print_r($result);  

Hessian's Blog

记录生活点滴，见证成长历程。

[转]php sphinx 高效率搜索引擎搭建

Like this:

发表回复取消回复

2025年 7月
一	二	三	四	五	六	日
	1	2	3	4	5	6
7	8	9	10	11	12	13
14	15	16	17	18	19	20
21	22	23	24	25	26	27
28	29	30	31

Share this:

Like this:

发表回复 取消回复

发表回复取消回复