前段时间打算建个垃圾英文站来测试谷歌收录情况,之前破解的胖鼠采集插件(当时用xyz域名建了个垃圾站,用的就是这个采集器,谷歌一直没收录,后来据说是谷歌对xyz域名不友好)没法用了。选来选去,最后还是选择了火车头采集器!
采集规则都是现学现用,还算简单的!建站文章反正就是采集中文文章,然后利用翻译插件翻译成英文发布就行。火车头可以自写插件去进行文章内容处理。
后来就去找翻译的api,看到了个讯飞的api。免费调用用完了~我就拿出来供大家用吧。。。结合火车头的插件模板,找chatgpt写了个这样的插件!V1版本
<?php
error_reporting(E_ERROR | E_WARNING | E_PARSE);
/**
 * Small client for the iFlytek translation endpoint https://itrans.xfyun.cn/v2/its.
 *
 * The credentials inside xfyun_translate() are placeholders and must be
 * replaced with real values before the class can be used.
 */
class Its_test {
    /**
     * Send a POST request with cURL and return the raw response body.
     *
     * On a cURL error the script is terminated with die() and the cURL error
     * message is printed.
     *
     * @param string       $url     Target URL.
     * @param array|mixed  $header  Header lines; only applied when it is an array.
     * @param array|string $content Request body; arrays are form-encoded first.
     * @return string|bool Response body as returned by curl_exec().
     */
    function tocurl($url, $header, $content) {
        $ch = curl_init();
        if (substr($url, 0, 5) == 'https') {
            // NOTE(review): peer and host verification are switched off here, so the
            // connection is not protected against man-in-the-middle attacks. Left
            // unchanged because enabling it needs a CA bundle on the host machine.
            curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
            curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
            curl_setopt($ch, CURLOPT_SSLVERSION, 1);
        }
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_URL, $url);
        if (is_array($header)) {
            curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
        }
        curl_setopt($ch, CURLOPT_POST, true);
        if (!empty($content)) {
            curl_setopt($ch, CURLOPT_POSTFIELDS, is_array($content) ? http_build_query($content) : $content);
        }
        $response = curl_exec($ch);
        $error = curl_error($ch);
        // Release the handle on every path, including the failure path below.
        curl_close($ch);
        if ($error) {
            die($error);
        }
        return $response;
    }

    /**
     * Translate text from "cn" to "en" through the v2/its endpoint.
     *
     * Empty or whitespace-only input is returned as-is without calling the API.
     *
     * @param string $text Text to translate.
     * @return string|null Translated text, or null when the response does not
     *                     contain a translation (e.g. the API reported an error).
     */
    function xfyun_translate($text) {
        // Nothing to translate: skip the signed request entirely.
        if ($text === null || trim((string) $text) === '') {
            return (string) $text;
        }

        $app_id = "写自己的";
        $api_sec = "写自己的";
        $api_key = "写自己的";
        $url = "https://itrans.xfyun.cn/v2/its";

        $body = json_encode($this->getBody($app_id, "cn", "en", $text));
        $date = gmdate('D, d M Y H:i:s') . ' GMT';
        $digestBase64 = "SHA-256=" . base64_encode(hash("sha256", $body, true));

        // The signature covers host, date, request line and body digest, in this order.
        $builder = sprintf("host: %s\ndate: %s\nPOST /v2/its HTTP/1.1\ndigest: %s", "itrans.xfyun.cn", $date, $digestBase64);
        $sha = base64_encode(hash_hmac("sha256", $builder, $api_sec, true));
        $authorization = sprintf("api_key=\"%s\", algorithm=\"%s\", headers=\"%s\", signature=\"%s\"", $api_key, "hmac-sha256", "host date request-line digest", $sha);

        $header = array(
            "Authorization: " . $authorization,
            'Content-Type: application/json',
            'Accept: application/json,version=1.0',
            'Host: itrans.xfyun.cn',
            'Date: ' . $date,
            'Digest: ' . $digestBase64
        );

        $response = $this->tocurl($url, $header, $body);
        $result = json_decode($response, true);

        // Guard the nested lookup: reading missing keys would emit PHP warnings
        // into the plugin output. The return value for that case stays null.
        if (!isset($result['data']['result']['trans_result']['dst'])) {
            return null;
        }
        return $result['data']['result']['trans_result']['dst'];
    }

    /**
     * Build the request payload as a PHP array (the caller JSON-encodes it).
     *
     * @param string $app_id Application id.
     * @param string $from   Source language code.
     * @param string $to     Target language code.
     * @param string $text   Plain text; it is base64-encoded into the payload.
     * @return array
     */
    function getBody($app_id, $from, $to, $text) {
        return array(
            'common' => array('app_id' => $app_id),
            'business' => array('from' => $from, 'to' => $to),
            'data' => array('text' => base64_encode($text))
        );
    }
}
// Plugin entry point. $LabelArray and $LabelUrl are not defined in this script;
// presumably the collector host injects them before the plugin runs.
$translator = new Its_test();

switch ($LabelArray['PageType']) {
    case "List":
    case "Content":
        // List and content pages get the same header prepended to the received HTML.
        $LabelArray['Html'] = '当前页面的网址为:' . $LabelUrl . "\r\n页面类型为:" . $LabelArray['PageType'] . "\r\n接收到的数据是:" . $LabelArray['Html'];
        break;

    case "Save":
        // Translate the body first, then the title.
        $translatedBody = $translator->xfyun_translate($LabelArray['内容']);
        $translatedTitle = $translator->xfyun_translate($LabelArray['标题']);

        // Post-process the translated body with a string replacement.
        $LabelArray['内容'] = str_replace('旧字符串', '新字符串', $translatedBody);
        $LabelArray['标题'] = $translatedTitle;

        // Stamp the record with the current local time.
        $LabelArray['时间'] = date('Y-m-d H:i:s');
        break;
}

// Emit the result as a serialized array.
echo serialize($LabelArray);
?>
用起来还行~但是超了5000字就出问题,后来发现用https://itrans.xf-yun.com/v1/its接口的这个api能处理的字数多一点~但是接口调用不同,然后再找GPT写了下面这个插件。V2版本
<?php
error_reporting(E_ERROR | E_WARNING | E_PARSE);
/**
 * Client for the iFlytek translation endpoint https://itrans.xf-yun.com/v1/its.
 *
 * APPID / APISecret / APIKey below are placeholders and must be replaced with
 * real credentials. Failures are reported by returning a string that starts
 * with "Error: " instead of a translation.
 */
class MachineTranslation {
    private $requestUrl = "https://itrans.xf-yun.com/v1/its";
    private $APPID = "写自己的";
    private $APISecret = "写自己的";
    private $APIKey = "写自己的";
    private $RES_ID = "its_en_cn_word";
    private $FROM = "cn"; // source language
    private $TO = "en"; // target language
    private $TEXT;

    /**
     * Translate a piece of text from $FROM to $TO.
     *
     * Empty or whitespace-only input is returned unchanged without a request.
     *
     * @param string $text Text to translate.
     * @return string Translated text, or "Error: ..." on failure.
     */
    public function translateText($text) {
        if ($text === null || trim((string) $text) === '') {
            return (string) $text;
        }
        $this->TEXT = $text;
        return $this->main();
    }

    /**
     * Run the request for the text stored by translateText() and extract the
     * translation from the response.
     *
     * @return string Translated text, or "Error: ..." on failure.
     */
    public function main() {
        try {
            $resp = $this->doRequest();
            $respArray = json_decode($resp, true);

            // A response without payload.result.text cannot be decoded; report it
            // through the same "Error: ..." path that transport failures use.
            if (!isset($respArray['payload']['result']['text'])) {
                throw new Exception("Unexpected API response: " . substr((string) $resp, 0, 300));
            }

            // The result text is base64-encoded JSON that carries trans_result.dst.
            $textBase64Decode = base64_decode($respArray['payload']['result']['text']);
            $textArray = json_decode($textBase64Decode, true);
            if (!isset($textArray['trans_result']['dst'])) {
                throw new Exception("Translation missing from API result: " . substr((string) $textBase64Decode, 0, 300));
            }

            return $textArray['trans_result']['dst'];
        } catch (Exception $e) {
            return "Error: " . $e->getMessage();
        }
    }

    /**
     * POST the JSON payload to the signed URL.
     *
     * @return string Raw response body.
     * @throws Exception When cURL reports an error.
     */
    private function doRequest() {
        $url = $this->buildRequestUrl();
        $params = $this->buildParam();

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_HTTPHEADER, array('Content-Type: application/json'));
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $params);
        // NOTE(review): certificate verification is disabled; the connection is not
        // protected against man-in-the-middle attacks.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);

        $response = curl_exec($ch);
        // Read the error state before closing so the handle is released on every path.
        $errorNo = curl_errno($ch);
        $errorMsg = curl_error($ch);
        curl_close($ch);

        if ($response === false || $errorNo) {
            throw new Exception("Curl error: " . $errorMsg);
        }
        return $response;
    }

    /**
     * Build the request URL with the HMAC-SHA256 authorization in the query string.
     *
     * The signed string is "host", "date" and the request line, in that order.
     *
     * @return string
     */
    private function buildRequestUrl() {
        $url = parse_url($this->requestUrl);
        $date = gmdate("D, d M Y H:i:s T");
        $host = $url['host'];
        $request_line = "POST " . $url['path'] . " HTTP/1.1";
        $string_to_sign = "host: $host\n" . "date: $date\n" . $request_line;
        $signature = base64_encode(hash_hmac('sha256', $string_to_sign, $this->APISecret, true));
        $authorization = sprintf('api_key="%s", algorithm="hmac-sha256", headers="host date request-line", signature="%s"', $this->APIKey, $signature);
        $authBase = base64_encode($authorization);
        return sprintf("%s?authorization=%s&host=%s&date=%s",
            $this->requestUrl,
            urlencode($authBase),
            urlencode($host),
            urlencode($date)
        );
    }

    /**
     * Build the JSON request body; the text is sent base64-encoded.
     *
     * @return string JSON document.
     */
    private function buildParam() {
        $param = array(
            'header' => array(
                'app_id' => $this->APPID,
                'status' => 3,
                'res_id' => $this->RES_ID
            ),
            'parameter' => array(
                'its' => array(
                    'from' => $this->FROM,
                    'to' => $this->TO,
                    'result' => new stdClass() // encodes as an empty JSON object
                )
            ),
            'payload' => array(
                'input_data' => array(
                    'encoding' => 'utf8',
                    'status' => 3,
                    'text' => base64_encode($this->TEXT)
                )
            )
        );
        return json_encode($param);
    }
}
// Plugin entry point. $LabelArray is not defined in this script; presumably the
// collector host injects it before the plugin runs.
if ($LabelArray['PageType'] == "Save") {
    $translator = new MachineTranslation();

    // Translate the title first, then the body, writing each result back in place.
    foreach (array('标题', '内容') as $field) {
        $LabelArray[$field] = $translator->translateText($LabelArray[$field]);
    }

    // Stamp the record with the current local time.
    $LabelArray['时间'] = date('Y-m-d H:i:s');
}

// Emit the result as a serialized array.
echo serialize($LabelArray);
其实本来这样已经能处理很多内容了~而且我发现多数时候没有消费讯飞的字符量!
但是!!
采集的如果是短篇小说啥的,字数有个几万字呢?处理办法就是根据采集内容中的<p>段落(以</p>为分隔)进行内容分割,然后把段落累加到差不多字数了再去翻译,最后把所有的翻译内容组合起来。于是就有了下面这个V3版本
<?php
error_reporting(E_ERROR | E_WARNING | E_PARSE);
/**
 * Client for the iFlytek translation endpoint https://itrans.xf-yun.com/v1/its.
 *
 * APPID / APISecret / APIKey below are placeholders and must be replaced with
 * real credentials. Failures are appended to error_log.txt (relative to the
 * working directory) and reported by returning a string starting with "Error: ".
 */
class MachineTranslation {
    private $requestUrl = "https://itrans.xf-yun.com/v1/its";
    private $APPID = "写自己的";
    private $APISecret = "写自己的";
    private $APIKey = "写自己的";
    private $RES_ID = "its_en_cn_word";
    private $FROM = "cn"; // source language
    private $TO = "en"; // target language
    private $TEXT;

    /**
     * Translate a piece of text from $FROM to $TO.
     *
     * Empty or whitespace-only input is returned unchanged without a request.
     *
     * @param string $text Text to translate.
     * @return string Translated text, or "Error: ..." on failure.
     */
    public function translateText($text) {
        if ($text === null || trim((string) $text) === '') {
            return (string) $text;
        }
        $this->TEXT = $text;
        return $this->main();
    }

    /**
     * Run the request for the text stored by translateText() and extract the
     * translation from the response.
     *
     * @return string Translated text, or "Error: ..." on failure.
     */
    public function main() {
        try {
            $resp = $this->doRequest();
            $respArray = json_decode($resp, true);

            // Route malformed or error responses through the catch block so they
            // are logged too, instead of emitting PHP warnings and returning null.
            if (!isset($respArray['payload']['result']['text'])) {
                throw new Exception("Unexpected API response: " . substr((string) $resp, 0, 300));
            }

            // The result text is base64-encoded JSON that carries trans_result.dst.
            $textBase64Decode = base64_decode($respArray['payload']['result']['text']);
            $textArray = json_decode($textBase64Decode, true);
            if (!isset($textArray['trans_result']['dst'])) {
                throw new Exception("Translation missing from API result: " . substr((string) $textBase64Decode, 0, 300));
            }

            return $textArray['trans_result']['dst'];
        } catch (Exception $e) {
            file_put_contents('error_log.txt', "Error: " . $e->getMessage() . "\n", FILE_APPEND);
            return "Error: " . $e->getMessage();
        }
    }

    /**
     * POST the JSON payload to the signed URL.
     *
     * @return string Raw response body.
     * @throws Exception When cURL reports an error.
     */
    private function doRequest() {
        $url = $this->buildRequestUrl();
        $params = $this->buildParam();

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_TIMEOUT, 60); // overall timeout: 60 seconds
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30); // connect timeout: 30 seconds
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_HTTPHEADER, array('Content-Type: application/json'));
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $params);
        // NOTE(review): certificate verification is disabled; the connection is not
        // protected against man-in-the-middle attacks.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);

        $response = curl_exec($ch);
        // Read the error state before closing so the handle is released on every path.
        $errorNo = curl_errno($ch);
        $errorMsg = curl_error($ch);
        curl_close($ch);

        if ($response === false || $errorNo) {
            throw new Exception("Curl error: " . $errorMsg);
        }
        return $response;
    }

    /**
     * Build the request URL with the HMAC-SHA256 authorization in the query string.
     *
     * The signed string is "host", "date" and the request line, in that order.
     *
     * @return string
     */
    private function buildRequestUrl() {
        $url = parse_url($this->requestUrl);
        $date = gmdate("D, d M Y H:i:s T");
        $host = $url['host'];
        $request_line = "POST " . $url['path'] . " HTTP/1.1";
        $string_to_sign = "host: $host\n" . "date: $date\n" . $request_line;
        $signature = base64_encode(hash_hmac('sha256', $string_to_sign, $this->APISecret, true));
        $authorization = sprintf('api_key="%s", algorithm="hmac-sha256", headers="host date request-line", signature="%s"', $this->APIKey, $signature);
        $authBase = base64_encode($authorization);
        return sprintf("%s?authorization=%s&host=%s&date=%s",
            $this->requestUrl,
            urlencode($authBase),
            urlencode($host),
            urlencode($date)
        );
    }

    /**
     * Build the JSON request body; the text is sent base64-encoded.
     *
     * @return string JSON document.
     */
    private function buildParam() {
        $param = array(
            'header' => array(
                'app_id' => $this->APPID,
                'status' => 3,
                'res_id' => $this->RES_ID
            ),
            'parameter' => array(
                'its' => array(
                    'from' => $this->FROM,
                    'to' => $this->TO,
                    'result' => new stdClass() // encodes as an empty JSON object
                )
            ),
            'payload' => array(
                'input_data' => array(
                    'encoding' => 'utf8',
                    'status' => 3,
                    'text' => base64_encode($this->TEXT)
                )
            )
        );
        return json_encode($param);
    }
}
/**
 * Split HTML into chunks of whole paragraphs for separate translation requests.
 *
 * The content is cut right after each closing </p> tag (the tag stays with its
 * paragraph, whitespace following it is dropped). Paragraphs are then packed
 * greedily so each chunk stays within $maxLength bytes where possible. A single
 * paragraph longer than $maxLength is kept whole as its own chunk.
 *
 * @param string $content   HTML to split.
 * @param int    $maxLength Chunk size limit in bytes (strlen), not characters.
 * @return array List of non-empty, trimmed chunks in document order.
 */
function splitHtmlIntoChunks($content, $maxLength) {
    // The lookbehind keeps "</p>" attached to the paragraph it closes.
    $paragraphs = preg_split('/(?<=<\/p>)\s*/', $content, -1, PREG_SPLIT_NO_EMPTY);
    if ($paragraphs === false) {
        // Regex failure: fall back to a single chunk.
        $paragraphs = array($content);
    }

    $chunks = array();
    $current = '';
    foreach ($paragraphs as $paragraph) {
        // Flush only when something has been collected, so an oversized first
        // paragraph never produces an empty chunk.
        if ($current !== '' && strlen($current) + strlen($paragraph) > $maxLength) {
            $chunks[] = trim($current);
            $current = '';
        }
        $current .= $paragraph;
    }

    $current = trim($current);
    if ($current !== '') {
        $chunks[] = $current;
    }
    return $chunks;
}

// Plugin entry point. $LabelArray is not defined in this script; presumably the
// collector host injects it before the plugin runs.
if ($LabelArray['PageType'] == "Save") {
    $translator = new MachineTranslation();

    // Translate the title.
    $LabelArray['标题'] = $translator->translateText($LabelArray['标题']);

    // Translate the body, in several requests when it exceeds the byte limit.
    $maxLength = 9000;
    $content = $LabelArray['内容'];
    if (strlen($content) > $maxLength) {
        $translatedSegments = array(); // array() instead of [] for old PHP versions
        foreach (splitHtmlIntoChunks($content, $maxLength) as $chunk) {
            $translatedSegments[] = $translator->translateText($chunk);
        }
        // Join the translated chunks back together.
        $LabelArray['内容'] = implode(' ', $translatedSegments);
    } else {
        // Body fits within the limit: translate it in one request.
        $LabelArray['内容'] = $translator->translateText($content);
    }

    // Stamp the record with the current local time.
    $LabelArray['时间'] = date('Y-m-d H:i:s', time());
}

// Emit the result as a serialized array.
echo serialize($LabelArray);
好了!这是我讯飞的终极版本!然后一晚上把我免费的字符量给用爆了!!!!买起来又贵!放弃!弃坑!!!!!